true_url 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 496c04ece239c5302c9548b0a4b7d93630125172
4
+ data.tar.gz: 74c68ce403f401579bbc9b354b283595b5516302
5
+ SHA512:
6
+ metadata.gz: cfbf6e4099c911e66ff6061ecd98b0e98c60b8411659904d57d76c9bf1b4a2da207f0a19b9f11597ff8bf22a8ae716a7d49706b0a46730b6e824c99e970542cd
7
+ data.tar.gz: c186b1d04c781de99d61d64b9b3052e2f667ba140cba10c05db5e51ab0b43beddc0ee20d2318a534b84ed7ac8019294949a4fd18bcfe50a4492230431e13db6a
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
15
+ build.bat
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --format doc
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in true_url.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Jonathan Wong
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1 @@
1
+ # true_url
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require 'rspec/core/rake_task'
2
+
3
+ RSpec::Core::RakeTask.new :spec
4
+
5
+ task default: :spec
@@ -0,0 +1,39 @@
class TrueURL
  # Mutable state shared by the strategies during one canonicalisation run:
  # the URL as originally supplied, the URL currently being worked on, the
  # effective options, and any extra attributes strategies discover.
  class Context
    attr_reader :original_url, :options, :attributes, :working_url

    # @param original_url [String, Addressable::URI] the URL to canonicalise
    # @param options [Hash] effective options for this run
    def initialize(original_url, options)
      @original_url = parse(original_url)
      @options = options
      @finalized = false
      @attributes = {}

      set_working_url(original_url)
    end

    # Replaces the working URL.
    #
    # @param url [String, Addressable::URI] new URL, possibly relative
    # @param base_url [String, Addressable::URI, nil] base that +url+ is
    #   resolved against when given
    # @return [Addressable::URI] the new, normalised working URL
    def set_working_url(url, base_url = nil)
      candidate = base_url.nil? ? parse(url) : parse(base_url).join(parse(url))

      # If the URL has no scheme, then we assume HTTP
      if candidate.scheme.nil?
        raw = url.to_s
        candidate = parse(raw.start_with?('//') ? "http:#{raw}" : "http://#{raw}")
      end

      # `normalize` returns a new URI instead of mutating the receiver, so its
      # result has to be stored; the previous code discarded it. Storing the
      # copy also means a URI object handed in by the caller is never the
      # object that later gets its scheme/fragment/query rewritten.
      @working_url = candidate.normalize
    end

    # Marks the working URL as final so no further strategy or fetch runs.
    def finalize
      @finalized = true
    end

    # @return [Boolean] whether a strategy has declared the URL final
    def finalized?
      @finalized
    end

    private

    # Returns +url+ unchanged when it is already a URI object, otherwise
    # parses it with Addressable.
    def parse(url)
      url.is_a?(Addressable::URI) ? url : Addressable::URI.parse(url)
    end
  end
end
@@ -0,0 +1,56 @@
class TrueURL
  module Strategy
    # Canonicalises DailyMotion video and playlist URLs (including dai.ly
    # short links and embed URLs) and records the embed URL for videos.
    class DailyMotion
      VIDEO_PREFIX = '/video/'.freeze
      EMBED_PREFIX = '/embed/video/'.freeze
      PLAYLIST_PREFIX = '/playlist/'.freeze

      # Rewrites the context's working URL when the path identifies a video
      # or playlist, and always forces the HTTPS scheme.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path.to_s
        video_id = nil
        playlist_id = nil

        if url.host == 'dai.ly'
          video_id = leading_id(path[1..-1])
        elsif path.start_with?(VIDEO_PREFIX)
          video_id = clean_video_id(path)
        elsif path.start_with?(EMBED_PREFIX)
          # Only "/embed/video/<id>" carries a video ID; other "/embed/..."
          # paths used to be sliced at a fixed offset and produced garbage.
          video_id = leading_id(path[EMBED_PREFIX.length..-1])
        elsif path.start_with?(PLAYLIST_PREFIX)
          playlist_id = clean_playlist_id(path)
        end

        if video_id
          context.set_working_url("https://www.dailymotion.com/video/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.dailymotion.com/embed/video/#{video_id}"
        elsif playlist_id
          context.set_working_url("https://www.dailymotion.com/playlist/#{playlist_id}")
          context.finalize
        end

        # DailyMotion supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end

      # Extracts the video ID from a "/video/<id>[_slug]" path.
      #
      # @param path [String] full URL path starting with "/video/"
      # @return [String, nil] the ID, or nil when the path carries none
      def clean_video_id(path)
        leading_id(path[VIDEO_PREFIX.length..-1])
      end

      # Extracts the playlist ID from a "/playlist/<id>[_slug][/page]" path.
      #
      # @param path [String] full URL path starting with "/playlist/"
      # @return [String, nil] the ID, or nil when the path carries none
      def clean_playlist_id(path)
        leading_id(path[PLAYLIST_PREFIX.length..-1])
      end

      private

      # Returns the run of characters before the first "_" or "/", whichever
      # comes first, or nil when that run is empty.
      def leading_id(text)
        text.to_s[%r{\A[^_/]+}]
      end
    end
  end
end
@@ -0,0 +1,25 @@
class TrueURL
  module Strategy
    # Canonicalises Nico Nico Douga watch/embed URLs and records the embed
    # URL for the video.
    class NicoVideo
      # "/watch/<id>" and "/thumb_watch/<id>", with anything after the ID ignored.
      WATCH_PATH = %r{\A/(?:watch|thumb_watch)/([^/]+)}
      # nico.ms short links; only the video ID prefixes are accepted here so
      # that short links to other content types are left alone.
      SHORT_VIDEO_PATH = %r{\A/((?:sm|nm|so)\d+)(?:/|\z)}

      # Rewrites the context's working URL when the path identifies a video,
      # and always forces the HTTP scheme.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path.to_s

        video_id = path[WATCH_PATH, 1]
        # The default strategy list routes nico.ms to this strategy, but its
        # "/<id>" short form was previously never recognised.
        video_id ||= path[SHORT_VIDEO_PATH, 1] if url.host == 'nico.ms'

        if video_id
          context.set_working_url("http://www.nicovideo.jp/watch/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "http://embed.nicovideo.jp/watch/#{video_id}"
        end

        # Nico Video only supports HTTP
        context.working_url.scheme = 'http'
      end
    end
  end
end
@@ -0,0 +1,25 @@
class TrueURL
  module Strategy
    # Canonicalises links to individual tweets, including legacy hashbang
    # ("#!/user/status/123") URLs.
    class Twitter
      # "/<user>/status/<digits>" where the ID must end at a "/" or at the
      # end of the path. Anchored with \A so only the start of the path is
      # considered, even if the path contains line breaks.
      TWEET_PATH = %r{\A/(\w+)/status/(\d+)(?:/|\z)}

      # Collapses a hashbang fragment into the path, then rewrites the
      # context's working URL when the path points at a tweet.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        fragment = url.fragment

        # Special handling to collapse Twitter hashbang (#!) URLs
        if fragment && fragment.start_with?('!/')
          url.path = fragment[1..-1]
          url.fragment = nil
        end

        tweet = TWEET_PATH.match(url.path.to_s)
        return if tweet.nil?

        # User names are case-insensitive, so the lower-case form is canonical.
        context.set_working_url("https://twitter.com/#{tweet[1].downcase}/status/#{tweet[2]}")
        context.finalize
      end
    end
  end
end
@@ -0,0 +1,28 @@
class TrueURL
  module Strategy
    # Canonicalises Vimeo video URLs (direct, channel and player/embed forms)
    # and records the embed URL for the video.
    class Vimeo
      # Player URLs: "/video/<digits>", optionally followed by more segments.
      PLAYER_PATH = %r{\A/video/(\d+)(?:/|\z)}
      # Channel URLs: "/channels/<name>/<digits>" with optional trailing slash.
      CHANNEL_PATH = %r{\A/channels/\w+/(\d+)/?\z}
      # Direct URLs: "/<digits>" with optional trailing slash.
      DIRECT_PATH = %r{\A/(\d+)/?\z}

      # Rewrites the context's working URL when the path identifies a video,
      # and always forces the HTTPS scheme.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path.to_s

        video_id =
          if url.host == 'player.vimeo.com'
            # Previously sliced at a fixed offset without checking that the
            # path really started with "/video/".
            path[PLAYER_PATH, 1]
          else
            path[CHANNEL_PATH, 1] || path[DIRECT_PATH, 1]
          end

        if video_id
          context.set_working_url("https://vimeo.com/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://player.vimeo.com/video/#{video_id}"
        end

        # Vimeo supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end
    end
  end
end
@@ -0,0 +1,39 @@
class TrueURL
  module Strategy
    # Canonicalises YouTube video and playlist URLs (watch, youtu.be, embed
    # and youtube-nocookie forms) and records the matching embed URLs.
    class YouTube
      EMBED_PREFIX = '/embed/'.freeze
      EMBED_PLAYLIST_PREFIX = '/embed/videoseries'.freeze

      # Rewrites the context's working URL when it identifies a video or a
      # playlist; leaves every other URL untouched.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path.to_s
        video_id = nil
        playlist_id = nil

        if url.host == 'youtu.be'
          video_id = first_segment(path[1..-1])
        elsif path == '/watch'
          video_id = query_param(url, 'v')
        elsif path == '/playlist'
          playlist_id = query_param(url, 'list')
        elsif path.start_with?(EMBED_PLAYLIST_PREFIX)
          playlist_id = query_param(url, 'list')
        elsif path.start_with?(EMBED_PREFIX)
          video_id = first_segment(path[EMBED_PREFIX.length..-1])
        end

        if video_id
          context.set_working_url("https://www.youtube.com/watch?v=#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/#{video_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/#{video_id}"
        elsif playlist_id
          context.set_working_url("https://www.youtube.com/playlist?list=#{playlist_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/videoseries?list=#{playlist_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/videoseries?list=#{playlist_id}"
        end
      end

      private

      # Reads one query parameter. `query_values` is nil when the URL has no
      # query string, which used to raise NoMethodError for a bare "/watch"
      # or "/playlist"; blank or non-string values count as missing.
      def query_param(url, name)
        value = (url.query_values || {})[name]
        value.is_a?(String) && !value.empty? ? value : nil
      end

      # Returns the text before the first "/", or nil when it is empty, so a
      # bare "youtu.be/" or "/embed/" is not turned into an ID-less URL.
      def first_segment(text)
        text.to_s[%r{\A[^/]+}]
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'true_url/strategy/dailymotion'
2
+ require 'true_url/strategy/nicovideo'
3
+ require 'true_url/strategy/twitter'
4
+ require 'true_url/strategy/vimeo'
5
+ require 'true_url/strategy/youtube'
6
+
class TrueURL
  module Strategy
    # Builds the default strategy table.
    #
    # Each entry is a +[host_pattern, strategy]+ pair; a strategy runs when
    # the working URL's host matches the pattern. Patterns escape their dots
    # and require the domain to start either at the beginning of the host or
    # right after a "." so that look-alike hosts such as "notyoutube.com" or
    # "youtubexcom" are no longer handed to a site-specific strategy.
    #
    # @return [Array<Array(Regexp, Object)>] fresh list with new strategy instances
    def self.default_list
      [
        [/(?:\A|\.)youtube\.com\z/i, TrueURL::Strategy::YouTube.new],
        [/(?:\A|\.)youtube-nocookie\.com\z/i, TrueURL::Strategy::YouTube.new],
        [/\Ayoutu\.be\z/i, TrueURL::Strategy::YouTube.new],
        [/(?:\A|\.)dailymotion\.com\z/i, TrueURL::Strategy::DailyMotion.new],
        [/\Adai\.ly\z/i, TrueURL::Strategy::DailyMotion.new],
        [/(?:\A|\.)vimeo\.com\z/i, TrueURL::Strategy::Vimeo.new],
        [/(?:\A|\.)nicovideo\.jp\z/i, TrueURL::Strategy::NicoVideo.new],
        [/\Anico\.ms\z/i, TrueURL::Strategy::NicoVideo.new],
        [/(?:\A|\.)twitter\.com\z/i, TrueURL::Strategy::Twitter.new]
      ]
    end
  end
end
@@ -0,0 +1,3 @@
1
+ class TrueURL
2
+ VERSION = '0.0.1'.freeze
3
+ end
data/lib/true_url.rb ADDED
@@ -0,0 +1,140 @@
1
+ require 'addressable/uri'
2
+
# Finds the cleanest, most likely canonical form of a URL by running
# site-specific strategies and, when allowed, fetching the page to read its
# declared canonical URL.
class TrueURL
  # The version file defines TrueURL::VERSION; the previous autoload was
  # registered under the non-existent constant name "Version" and so never
  # made the version available.
  autoload :VERSION, 'true_url/version'
  autoload :Context, 'true_url/context'
  autoload :Strategy, 'true_url/strategy'

  attr_accessor :context, :strategies

  OPTIONS = {
    scheme_override: nil, # Possible choices: "https", "http", nil (preserve scheme)
    fetch: true # Whether to fetch the URL
  }.freeze

  QUERY_VALUES_TO_REMOVE = %w(
    utm_source
    utm_medium
    utm_term
    utm_content
    utm_campaign
    sms_ss
    awesm
    xtor
    PHPSESSID
  ).freeze

  # One "<target>; params" entry of a Link header value.
  LINK_ENTRY = /<([^>]*)>([^<]*)/
  # The rel parameter of a Link entry: double-quoted, single-quoted or bare.
  LINK_REL = /;\s*rel\s*=\s*(?:"([^"]*)"|'([^']*)'|([^;,\s]+))/i

  # @param url [String, Addressable::URI] the URL to canonicalise
  # @param options [Hash] overrides for OPTIONS
  def initialize(url, options = {})
    @context = TrueURL::Context.new(url, OPTIONS.merge(options))
    @strategies = TrueURL::Strategy.default_list
    @executed = false
  end

  # @return [String] the canonical URL
  def canonical
    execute
    @context.working_url.to_s
  end

  # @return [Hash] extra attributes collected by the strategies
  def attributes
    execute
    @context.attributes
  end

  private

  # Runs the whole pipeline once; later calls are no-ops.
  def execute
    return if @executed

    execute_strategies

    # Strategies run a second time after a fetch because the fetch may have
    # replaced the working URL.
    if !@context.finalized? && attempt_fetch?
      fetch
      execute_strategies
    end

    scheme_override
    remove_fragments
    clean_query_values

    @executed = true
  end

  # Runs every strategy whose host criteria match, stopping as soon as one
  # of them finalizes the context.
  def execute_strategies
    @strategies.each do |match_criteria, strategy|
      next if @context.finalized?
      next unless strategy_match?(match_criteria)

      strategy.execute(@context)
    end
  end

  # @param match_criteria [Regexp, nil] nil matches every host
  # @return [Boolean] whether the working URL's host satisfies the criteria
  def strategy_match?(match_criteria)
    return true if match_criteria.nil?

    host = @context.working_url.host
    return false if host.nil?

    !host.match(match_criteria).nil?
  end

  def attempt_fetch?
    return false unless @context.options[:fetch]

    # Must at least have a host, otherwise we can't find the site to crawl
    return false if @context.working_url.host.nil?

    # We only support HTTP or HTTPS
    %w(http https).include?(@context.working_url.scheme)
  end

  # Fetches the working URL (following redirects) and replaces it with the
  # canonical URL declared by the response, or with the final response URL
  # when none is declared.
  def fetch
    require 'http' unless defined? HTTP

    starting_url = @context.working_url
    response = HTTP.follow.get(starting_url)

    # A relative canonical URL belongs to the page that was finally served,
    # so it is resolved against the post-redirect URL rather than against the
    # URL the request started from.
    base_url = response.uri || starting_url
    declared = find_canonical_header(response.headers) || find_canonical_url(response.to_s)

    if declared
      @context.set_working_url(declared, base_url)
    else
      @context.set_working_url(base_url)
    end
  end

  # Looks for a rel="canonical" target in the Link response header(s).
  # Handles several header lines, several comma-separated links per line, a
  # rel parameter in any position, quoted or bare, with any letter case.
  #
  # @param headers [#[]] response headers
  # @return [String, nil] the canonical target, or nil when none is declared
  def find_canonical_header(headers)
    Array(headers['Link']).each do |value|
      value.to_s.scan(LINK_ENTRY) do |target, params|
        rel = LINK_REL.match(params)
        next if rel.nil?

        relations = (rel[1] || rel[2] || rel[3]).split
        next unless relations.any? { |r| r.casecmp('canonical').zero? }

        target = target.strip
        return target unless target.empty?
      end
    end

    nil
  end

  # Looks for a canonical URL declared in the HTML, preferring
  # <link rel="canonical"> over <meta property="og:url">. Blank values are
  # treated as missing so an empty href does not hide a usable og:url.
  #
  # @param html [String] response body
  # @return [String, nil]
  def find_canonical_url(html)
    require 'nokogiri' unless defined? Nokogiri::HTML

    doc = Nokogiri::HTML(html)

    link = doc.at('link[rel="canonical"]')
    meta = doc.at('meta[property="og:url"]')

    presence(link && link['href']) || presence(meta && meta['content'])
  end

  def scheme_override
    forced = @context.options[:scheme_override]
    @context.working_url.scheme = forced unless forced.nil?
  end

  def remove_fragments
    @context.working_url.fragment = nil
  end

  # Drops the tracking/session parameters listed in QUERY_VALUES_TO_REMOVE.
  def clean_query_values
    query_values = @context.working_url.query_values
    return if query_values.nil?

    QUERY_VALUES_TO_REMOVE.each { |name| query_values.delete(name) }
    @context.working_url.query_values = query_values.empty? ? nil : query_values
  end

  # @return [String, nil] the stripped text, or nil when it is nil or blank
  def presence(value)
    text = value.to_s.strip
    text.empty? ? nil : text
  end
end
@@ -0,0 +1,96 @@
1
+ require 'true_url'
2
+
3
+ # This file was generated by the `rspec --init` command. Conventionally, all
4
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
5
+ # The generated `.rspec` file contains `--require spec_helper` which will cause
6
+ # this file to always be loaded, without a need to explicitly require it in any
7
+ # files.
8
+ #
9
+ # Given that it is always loaded, you are encouraged to keep this file as
10
+ # light-weight as possible. Requiring heavyweight dependencies from this file
11
+ # will add to the boot time of your test suite on EVERY test run, even for an
12
+ # individual file that may not need all of that loaded. Instead, consider making
13
+ # a separate helper file that requires the additional dependencies and performs
14
+ # the additional setup, and require it from the spec files that actually need
15
+ # it.
16
+ #
17
+ # The `.rspec` file also contains a few flags that are not defaults but that
18
+ # users commonly want.
19
+ #
20
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
21
+ RSpec.configure do |config|
22
+ # rspec-expectations config goes here. You can use an alternate
23
+ # assertion/expectation library such as wrong or the stdlib/minitest
24
+ # assertions if you prefer.
25
+ config.expect_with :rspec do |expectations|
26
+ # This option will default to `true` in RSpec 4. It makes the `description`
27
+ # and `failure_message` of custom matchers include text for helper methods
28
+ # defined using `chain`, e.g.:
29
+ # be_bigger_than(2).and_smaller_than(4).description
30
+ # # => "be bigger than 2 and smaller than 4"
31
+ # ...rather than:
32
+ # # => "be bigger than 2"
33
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
34
+ end
35
+
36
+ # rspec-mocks config goes here. You can use an alternate test double
37
+ # library (such as bogus or mocha) by changing the `mock_with` option here.
38
+ config.mock_with :rspec do |mocks|
39
+ # Prevents you from mocking or stubbing a method that does not exist on
40
+ # a real object. This is generally recommended, and will default to
41
+ # `true` in RSpec 4.
42
+ mocks.verify_partial_doubles = true
43
+ end
44
+
45
+ # The settings below are suggested to provide a good initial experience
46
+ # with RSpec, but feel free to customize to your heart's content.
47
+ # # These two settings work together to allow you to limit a spec run
48
+ # # to individual examples or groups you care about by tagging them with
49
+ # # `:focus` metadata. When nothing is tagged with `:focus`, all examples
50
+ # # get run.
51
+ # config.filter_run :focus
52
+ # config.run_all_when_everything_filtered = true
53
+ #
54
+ # # Allows RSpec to persist some state between runs in order to support
55
+ # # the `--only-failures` and `--next-failure` CLI options. We recommend
56
+ # # you configure your source control system to ignore this file.
57
+ # config.example_status_persistence_file_path = "spec/examples.txt"
58
+ #
59
+ # # Limits the available syntax to the non-monkey patched syntax that is
60
+ # # recommended. For more details, see:
61
+ # # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
62
+ # # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
63
+ # # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
64
+ # config.disable_monkey_patching!
65
+ #
66
+ # # This setting enables warnings. It's recommended, but in some cases may
67
+ # # be too noisy due to issues in dependencies.
68
+ # config.warnings = true
69
+ #
70
+ # # Many RSpec users commonly either run the entire suite or an individual
71
+ # # file, and it's useful to allow more verbose output when running an
72
+ # # individual spec file.
73
+ # if config.files_to_run.one?
74
+ # # Use the documentation formatter for detailed output,
75
+ # # unless a formatter has already been configured
76
+ # # (e.g. via a command-line flag).
77
+ # config.default_formatter = 'doc'
78
+ # end
79
+ #
80
+ # # Print the 10 slowest examples and example groups at the
81
+ # # end of the spec run, to help surface which specs are running
82
+ # # particularly slow.
83
+ # config.profile_examples = 10
84
+ #
85
+ # # Run specs in random order to surface order dependencies. If you find an
86
+ # # order dependency and want to debug it, you can fix the order by providing
87
+ # # the seed, which is printed after each run.
88
+ # # --seed 1234
89
+ # config.order = :random
90
+ #
91
+ # # Seed global randomization in this process using the `--seed` CLI option.
92
+ # # Setting this allows you to use `--seed` to deterministically reproduce
93
+ # # test failures related to randomization by passing the same `--seed` value
94
+ # # as the one that triggered the failure.
95
+ # Kernel.srand config.seed
96
+ end
@@ -0,0 +1,204 @@
1
+ require 'spec_helper'
2
+
3
+ def gc(unclean_url, options = {})
4
+ TrueURL.new(unclean_url, options).canonical
5
+ end
6
+
7
+ describe TrueURL do
8
+ describe 'YouTube' do
9
+ it 'supports direct video links' do
10
+ t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
11
+ expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI')).to eq t
12
+ expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI&feature=youtu.be&list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
13
+ expect(gc('https://youtu.be/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
14
+ end
15
+
16
+ it 'supports embedded video links' do
17
+ t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
18
+ expect(gc('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
19
+ expect(gc('https://www.youtube-nocookie.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&amp;controls=0&amp;showinfo=0')).to eq t
20
+ end
21
+
22
+ it 'supports direct playlist links' do
23
+ t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
24
+ expect(gc('https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
25
+ expect(gc('https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
26
+ end
27
+
28
+ it 'supports embedded playlist links' do
29
+ t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
30
+ expect(gc('https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
31
+ expect(gc('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
32
+ end
33
+
34
+ it 'supports direct channel links' do
35
+ t = 'https://www.youtube.com/user/WatchMojo'
36
+ expect(gc('https://www.youtube.com/channel/UCaWd5_7JhbQBe4dknZhsHJg')).to eq t
37
+ end
38
+
39
+ it 'supports retrieving embed links as attributes' do
40
+ x = TrueURL.new('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')
41
+ expect(x.attributes[:embed_url]).to eq 'https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
42
+
43
+ x = TrueURL.new('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&amp;controls=0&amp;showinfo=0')
44
+ expect(x.attributes[:embed_url_private]).to eq 'https://www.youtube-nocookie.com/embed/RDocnbkHjhI'
45
+ end
46
+ end
47
+
48
+ describe 'DailyMotion' do
49
+ it 'supports direct video links' do
50
+ t = 'https://www.dailymotion.com/video/x2k01a9'
51
+ expect(gc('http://dai.ly/x2k01a9')).to eq t
52
+ expect(gc('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')).to eq t
53
+ end
54
+
55
+ it 'supports embedded video links' do
56
+ t = 'https://www.dailymotion.com/video/x2k01a9'
57
+ expect(gc('http://www.dailymotion.com/embed/video/x2k01a9?autoPlay=1&start=40')).to eq t
58
+ end
59
+
60
+ it 'supports direct playlist links' do
61
+ t = 'https://www.dailymotion.com/playlist/x1ybux'
62
+ expect(gc('https://www.dailymotion.com/playlist/x1ybux/1#video=xlbw3e')).to eq t
63
+ expect(gc('https://www.dailymotion.com/playlist/x1ybux')).to eq t
64
+ expect(gc('http://www.dailymotion.com/playlist/x1ybux_ODNandfinally_amazing-world-records/1#video=xlbw3e')).to eq t
65
+ end
66
+
67
+ it 'supports retrieving embed links as attributes' do
68
+ x = TrueURL.new('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')
69
+ expect(x.attributes[:embed_url]).to eq 'https://www.dailymotion.com/embed/video/x2k01a9'
70
+ end
71
+
72
+ it 'supports force HTTPS' do
73
+ t = 'https://www.dailymotion.com/ODNandfinally'
74
+ expect(gc('http://www.dailymotion.com/ODNandfinally')).to eq t
75
+ end
76
+ end
77
+
78
+ describe 'Vimeo' do
79
+ it 'supports direct video links' do
80
+ t = 'https://vimeo.com/122258599'
81
+ expect(gc('https://vimeo.com/channels/staffpicks/122258599')).to eq t
82
+ expect(gc('http://vimeo.com/122258599')).to eq t
83
+ end
84
+
85
+ it 'supports embedded video links' do
86
+ t = 'https://vimeo.com/122258599'
87
+ expect(gc('https://player.vimeo.com/video/122258599?loop=1&color=c9ff23&title=0')).to eq t
88
+ end
89
+
90
+ it "supports Vimeo's relative canonical links" do
91
+ t = 'https://vimeo.com/channels/staffpicks'
92
+ expect(gc('http://vimeo.com/channels/staffpicks?some=silly&params=here')).to eq t
93
+ end
94
+
95
+ it 'supports retrieving embed links as attributes' do
96
+ x = TrueURL.new('https://vimeo.com/channels/staffpicks/122258599')
97
+ expect(x.attributes[:embed_url]).to eq 'https://player.vimeo.com/video/122258599'
98
+ end
99
+
100
+ it 'supports force HTTPS' do
101
+ t = 'https://vimeo.com/user3190002'
102
+ expect(gc('http://vimeo.com/user3190002')).to eq t
103
+ end
104
+ end
105
+
106
+ describe 'Nico Nico Douga' do
107
+ it 'should work with direct and embedded video links' do
108
+ t = 'http://www.nicovideo.jp/watch/sm25956031'
109
+ expect(gc('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')).to eq t
110
+ expect(gc('http://embed.nicovideo.jp/watch/sm25956031/script?w=490&h=307&redirect=1')).to eq t
111
+ expect(gc('http://embed.nicovideo.jp/watch/sm25956031?oldScript=1')).to eq t
112
+ end
113
+
114
+ it 'supports retrieving embed links as attributes' do
115
+ x = TrueURL.new('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')
116
+ expect(x.attributes[:embed_url]).to eq 'http://embed.nicovideo.jp/watch/sm25956031'
117
+ end
118
+ end
119
+
120
+ describe 'Twitter' do
121
+ it 'supports direct links to tweets' do
122
+ t = 'https://twitter.com/gangsta_project/status/578483098284748801'
123
+ expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/photo/1')).to eq t
124
+ expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/')).to eq t
125
+ end
126
+
127
+ it 'should handle hashbangs' do
128
+ t = 'https://twitter.com/gangsta_project/status/578483098284748801'
129
+ expect(gc('https://twitter.com/#!/GANGSTA_Project/status/578483098284748801/')).to eq t
130
+ end
131
+ end
132
+
133
+ describe 'URL Shorteners' do
134
+ it 'should work with t.co' do
135
+ t = 'http://www.prdaily.com/Main/Articles/3_essential_skills_for_todays_PR_pro__18404.aspx'
136
+ expect(gc('http://t.co/fvaGuRa5Za')).to eq t
137
+ expect(gc('https://t.co/fvaGuRa5Za')).to eq t
138
+ end
139
+
140
+ it 'should work with fb.me' do
141
+ t = 'https://www.facebook.com/aksuperdance/posts/1388968827814771'
142
+ expect(gc('http://fb.me/8qm5kW89k')).to eq t
143
+ end
144
+
145
+ it 'should work with ift.tt' do
146
+ t = 'http://tedxtaipei.com/articles/the_best_kindergarten_you_have_ever_seen/'
147
+ expect(gc('http://ift.tt/2iCbPy8')).to eq t
148
+ end
149
+
150
+ it 'should work with compounded URL shorteners' do
151
+ t = 'https://www.youtube.com/watch?v=jLhjsPjR-xk'
152
+ expect(gc('https://t.co/g4NYtZE3lW')).to eq t # http://bit.ly/2iCKic3 --> http://youtu.be/jLhjsPjR-xk?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY
153
+ end
154
+ end
155
+
156
+ describe 'WordPress' do
157
+ it 'supports missing trailing slashes' do
158
+ t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
159
+ expect(gc('http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled')).to eq t
160
+ end
161
+ end
162
+
163
+ describe 'Blogger' do
164
+ it 'supports missing localized Blogger domains' do
165
+ t = 'http://thevikiblog.blogspot.com/2015/12/soompi-ios-android.html'
166
+ expect(gc('http://thevikiblog.blogspot.sg/2015/12/soompi-ios-android.html')).to eq t
167
+ end
168
+ end
169
+
170
+ describe 'Other Scenarios' do
171
+ it 'supports missing schemes' do
172
+ t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
173
+ expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/')).to eq t
174
+ end
175
+
176
+ it 'supports scheme override' do
177
+ t = 'https://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
178
+ expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/', scheme_override: 'https')).to eq t
179
+ end
180
+
181
+ it 'supports CDJapan' do
182
+ t = 'http://www.cdjapan.co.jp/product/MDR-1012'
183
+ expect(gc('http://www.cdjapan.co.jp/aff/click.cgi/e86NDzbdSLQ/4323/A323439/detailview.html?KEY=MDR-1012')).to eq t
184
+ end
185
+
186
+ it 'supports MyAnimeList' do
187
+ t = 'https://myanimelist.net/forum/?topicid=1371295'
188
+ expect(gc('https://myanimelist.net/forum?topicid=1371295&goto=newpost')).to eq t
189
+ end
190
+
191
+ it 'supports URLs with escapable characters' do
192
+ t = 'http://goboiano.com/news/2568-attack-on-titan%2527s-first-live-action-trailer-finally-launches'
193
+ expect(gc("http://media.goboiano.com/news/2568-attack-on-titan's-first-live-action-trailer-finally-launches")).to eq t
194
+
195
+ t = 'http://randomc.net/image/Kekkai%20Sensen/Kekkai%20Sensen%20-%2001%20-%20Large%2001.jpg'
196
+ expect(gc('http://randomc.net/image/Kekkai Sensen/Kekkai Sensen - 01 - Large 01.jpg')).to eq t
197
+ end
198
+
199
+ it 'supports canonical HTTP headers' do
200
+ t = 'http://www.seoreviewtools.com/canonical-url-location-checker/'
201
+ expect(gc('http://www.seoreviewtools.com/tests/canonical-header.php')).to eq t
202
+ end
203
+ end
204
+ end
data/true_url.gemspec ADDED
@@ -0,0 +1,22 @@
require File.expand_path('../lib/true_url/version', __FILE__)

Gem::Specification.new do |spec|
  spec.name          = 'true_url'
  spec.version       = TrueURL::VERSION
  spec.authors       = ['Jonathan Wong']
  spec.email         = ['jonathan@armchairtheorist.com']
  spec.summary       = 'A multi-strategy approach to find the absolutely cleanest and most likely canonical URL of any given URL.'
  spec.homepage      = 'http://github.com/armchairtheorist/true_url'
  spec.license       = 'MIT'

  spec.files         = `git ls-files`.split("\n")
  # The previous "{spec}/*" pathspec matched nothing, which left test_files
  # empty; select the spec files from the tracked file list instead.
  spec.test_files    = spec.files.grep(%r{\Aspec/})
  spec.require_paths = ['lib']

  # "~> 0" only allows 0.x releases, which cannot run this spec suite.
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rake', '>= 10.0'

  # 'http' and 'nokogiri' are required by TrueURL#fetch, which runs with the
  # default `fetch: true` option, so they are needed at runtime and not only
  # during development.
  spec.add_runtime_dependency 'addressable', '~> 2.4'
  spec.add_runtime_dependency 'http', '~> 2.1'
  spec.add_runtime_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'
end
metadata ADDED
@@ -0,0 +1,151 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: true_url
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jonathan Wong
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-12-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: http
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2.1'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 2.1.0
51
+ type: :development
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '2.1'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 2.1.0
61
+ - !ruby/object:Gem::Dependency
62
+ name: nokogiri
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.6'
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: 1.6.8
71
+ type: :development
72
+ prerelease: false
73
+ version_requirements: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - "~>"
76
+ - !ruby/object:Gem::Version
77
+ version: '1.6'
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 1.6.8
81
+ - !ruby/object:Gem::Dependency
82
+ name: addressable
83
+ requirement: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - "~>"
86
+ - !ruby/object:Gem::Version
87
+ version: '2.4'
88
+ - - ">="
89
+ - !ruby/object:Gem::Version
90
+ version: 2.4.0
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '2.4'
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: 2.4.0
101
+ description:
102
+ email:
103
+ - jonathan@armchairtheorist.com
104
+ executables: []
105
+ extensions: []
106
+ extra_rdoc_files: []
107
+ files:
108
+ - ".gitignore"
109
+ - ".rspec"
110
+ - Gemfile
111
+ - LICENSE.txt
112
+ - README.md
113
+ - Rakefile
114
+ - lib/true_url.rb
115
+ - lib/true_url/context.rb
116
+ - lib/true_url/strategy.rb
117
+ - lib/true_url/strategy/dailymotion.rb
118
+ - lib/true_url/strategy/nicovideo.rb
119
+ - lib/true_url/strategy/twitter.rb
120
+ - lib/true_url/strategy/vimeo.rb
121
+ - lib/true_url/strategy/youtube.rb
122
+ - lib/true_url/version.rb
123
+ - spec/spec_helper.rb
124
+ - spec/true_url_spec.rb
125
+ - true_url.gemspec
126
+ homepage: http://github.com/armchairtheorist/true_url
127
+ licenses:
128
+ - MIT
129
+ metadata: {}
130
+ post_install_message:
131
+ rdoc_options: []
132
+ require_paths:
133
+ - lib
134
+ required_ruby_version: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ required_rubygems_version: !ruby/object:Gem::Requirement
140
+ requirements:
141
+ - - ">="
142
+ - !ruby/object:Gem::Version
143
+ version: '0'
144
+ requirements: []
145
+ rubyforge_project:
146
+ rubygems_version: 2.5.1
147
+ signing_key:
148
+ specification_version: 4
149
+ summary: A multi-strategy approach to find the absolutely cleanest and most likely
150
+ canonical URL of any given URL.
151
+ test_files: []