url_canonicalize 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/.hound.yml +4 -0
- data/.rspec +5 -0
- data/.rubocop.yml +55 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/Gemfile +17 -0
- data/Gemfile.local.example +11 -0
- data/Gemfile.lock +131 -0
- data/Guardfile +16 -0
- data/LICENSE +21 -0
- data/README.md +32 -0
- data/Rakefile +1 -0
- data/circle.yml +18 -0
- data/lib/monkey_patches/addressable/uri.rb +10 -0
- data/lib/monkey_patches/string.rb +21 -0
- data/lib/monkey_patches/uri.rb +9 -0
- data/lib/url_canonicalize/exception.rb +9 -0
- data/lib/url_canonicalize/http.rb +155 -0
- data/lib/url_canonicalize/request.rb +144 -0
- data/lib/url_canonicalize/response.rb +39 -0
- data/lib/url_canonicalize/uri.rb +25 -0
- data/lib/url_canonicalize/version.rb +3 -0
- data/lib/url_canonicalize.rb +30 -0
- data/url_canonicalize.gemspec +28 -0
- metadata +98 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 3b018935dfe00ea2df8f9265b439a846141a8b03
|
4
|
+
data.tar.gz: 0e103350c0c91b1ae261dd2996b112ef35aa6790
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e30f6a713c580cb396ccf9b717a32b8b56ff61f8aa8b90ab9fff6981383e7d78a90f4c343ae6cd2cb492aea745601c5c195b2cfd4849d16afe8a01b24d9b5672
|
7
|
+
data.tar.gz: 068cf183a3d2050de607f5d098cdf37defe6dc68ec3210f67c3d8fce045e63d4129bc79b870f42a25755594572558a9e70272aff421a476eb84c76e851784979
|
data/.gitignore
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
*.local
|
2
|
+
*.gem
|
3
|
+
*.rbc
|
4
|
+
/.config
|
5
|
+
/coverage/
|
6
|
+
/InstalledFiles
|
7
|
+
/pkg/
|
8
|
+
/spec/reports/
|
9
|
+
/spec/examples.txt
|
10
|
+
/test/tmp/
|
11
|
+
/test/version_tmp/
|
12
|
+
/tmp/
|
13
|
+
|
14
|
+
# Used by dotenv library to load environment variables.
|
15
|
+
# .env
|
16
|
+
|
17
|
+
## Specific to RubyMotion:
|
18
|
+
.dat*
|
19
|
+
.repl_history
|
20
|
+
build/
|
21
|
+
*.bridgesupport
|
22
|
+
build-iPhoneOS/
|
23
|
+
build-iPhoneSimulator/
|
24
|
+
|
25
|
+
## Specific to RubyMotion (use of CocoaPods):
|
26
|
+
#
|
27
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
28
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
29
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
30
|
+
#
|
31
|
+
# vendor/Pods/
|
32
|
+
|
33
|
+
## Documentation cache and generated files:
|
34
|
+
/.yardoc/
|
35
|
+
/_yardoc/
|
36
|
+
/doc/
|
37
|
+
/rdoc/
|
38
|
+
|
39
|
+
## Environment normalization:
|
40
|
+
/.bundle/
|
41
|
+
/vendor/bundle
|
42
|
+
/lib/bundler/man/
|
43
|
+
|
44
|
+
# for a library or gem, you might want to ignore these files since the code is
|
45
|
+
# intended to run in multiple environments; otherwise, check them in:
|
46
|
+
# Gemfile.lock
|
47
|
+
# .ruby-version
|
48
|
+
# .ruby-gemset
|
49
|
+
|
50
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
51
|
+
.rvmrc
|
data/.hound.yml
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
---
|
2
|
+
AllCops:
|
3
|
+
Exclude:
|
4
|
+
- '*.gemspec'
|
5
|
+
|
6
|
+
Style/TrailingCommaInArguments:
|
7
|
+
EnforcedStyleForMultiline: no_comma
|
8
|
+
Enabled: true
|
9
|
+
|
10
|
+
StringLiterals:
|
11
|
+
EnforcedStyle: single_quotes
|
12
|
+
Enabled: true
|
13
|
+
|
14
|
+
LineLength:
|
15
|
+
Max: 120
|
16
|
+
Exclude:
|
17
|
+
- 'spec/**/*'
|
18
|
+
|
19
|
+
MethodLength:
|
20
|
+
Max: 12
|
21
|
+
|
22
|
+
DotPosition:
|
23
|
+
Description: 'Checks the position of the dot in multi-line method calls.'
|
24
|
+
EnforcedStyle: leading
|
25
|
+
Enabled: true
|
26
|
+
|
27
|
+
ClassAndModuleChildren:
|
28
|
+
Description: 'Checks style of children classes and modules.'
|
29
|
+
EnforcedStyle: nested
|
30
|
+
Enabled: true
|
31
|
+
|
32
|
+
Documentation:
|
33
|
+
Description: 'Document classes and non-namespace modules.'
|
34
|
+
Enabled: false
|
35
|
+
|
36
|
+
FileName:
|
37
|
+
Description: 'Use snake_case for source file names.'
|
38
|
+
Enabled: true
|
39
|
+
|
40
|
+
Style/SymbolArray:
|
41
|
+
Description: 'Use %i or %I for arrays of symbols.'
|
42
|
+
StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#percent-i'
|
43
|
+
Enabled: false # Only available in Ruby 2.0+
|
44
|
+
|
45
|
+
Style/ExtraSpacing:
|
46
|
+
Description: 'Do not use unnecessary spacing.'
|
47
|
+
Enabled: true
|
48
|
+
|
49
|
+
Lint/LiteralInInterpolation:
|
50
|
+
Description: 'Avoid interpolating literals in strings'
|
51
|
+
AutoCorrect: true
|
52
|
+
|
53
|
+
Metrics/ClassLength:
|
54
|
+
CountComments: false # count full line comments?
|
55
|
+
Max: 120
|
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
url_canonicalize
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1.9
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
source 'https://rubygems.org'

# Runtime dependencies are declared in the gemspec
gemspec

# Gems needed only to run the specs and report coverage
group :test do
  gem 'rspec'
  gem 'rspec_junit_formatter'
  gem 'webmock'
  gem 'simplecov'
  gem 'coveralls', require: false
end

# An optional Gemfile.local may declare additional gems; it is evaluated
# as part of this Gemfile when present.
local_gemfile = 'Gemfile.local'

eval(File.read(local_gemfile)) if File.exist?(local_gemfile) # rubocop:disable Lint/Eval
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Example developer toolset.
# Copy this file to Gemfile.local to add these gems to your own environment,
# then edit Gemfile.local freely to suit your preferences.
group :development do
  gem 'rake'
  gem 'gem-release'
  gem 'rubocop'
  # listen is a dependency of guard; 3.1 requires Ruby 2.2+
  gem 'listen', '~> 3.0', '< 3.1'
  gem 'guard'
  gem 'guard-rspec'
  gem 'guard-rubocop'
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
url_canonicalize (0.0.1)
|
5
|
+
addressable (~> 2)
|
6
|
+
nokogiri (~> 1)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
addressable (2.4.0)
|
12
|
+
ast (2.3.0)
|
13
|
+
builder (3.2.2)
|
14
|
+
coderay (1.1.1)
|
15
|
+
coveralls (0.8.15)
|
16
|
+
json (>= 1.8, < 3)
|
17
|
+
simplecov (~> 0.12.0)
|
18
|
+
term-ansicolor (~> 1.3)
|
19
|
+
thor (~> 0.19.1)
|
20
|
+
tins (>= 1.6.0, < 2)
|
21
|
+
crack (0.4.3)
|
22
|
+
safe_yaml (~> 1.0.0)
|
23
|
+
diff-lcs (1.2.5)
|
24
|
+
docile (1.1.5)
|
25
|
+
ffi (1.9.14)
|
26
|
+
formatador (0.2.5)
|
27
|
+
gem-release (0.7.4)
|
28
|
+
guard (2.14.0)
|
29
|
+
formatador (>= 0.2.4)
|
30
|
+
listen (>= 2.7, < 4.0)
|
31
|
+
lumberjack (~> 1.0)
|
32
|
+
nenv (~> 0.1)
|
33
|
+
notiffany (~> 0.0)
|
34
|
+
pry (>= 0.9.12)
|
35
|
+
shellany (~> 0.0)
|
36
|
+
thor (>= 0.18.1)
|
37
|
+
guard-compat (1.2.1)
|
38
|
+
guard-rspec (4.7.3)
|
39
|
+
guard (~> 2.1)
|
40
|
+
guard-compat (~> 1.1)
|
41
|
+
rspec (>= 2.99.0, < 4.0)
|
42
|
+
guard-rubocop (1.2.0)
|
43
|
+
guard (~> 2.0)
|
44
|
+
rubocop (~> 0.20)
|
45
|
+
hashdiff (0.3.0)
|
46
|
+
json (2.0.2)
|
47
|
+
listen (3.0.8)
|
48
|
+
rb-fsevent (~> 0.9, >= 0.9.4)
|
49
|
+
rb-inotify (~> 0.9, >= 0.9.7)
|
50
|
+
lumberjack (1.0.10)
|
51
|
+
method_source (0.8.2)
|
52
|
+
mini_portile2 (2.1.0)
|
53
|
+
nenv (0.3.0)
|
54
|
+
nokogiri (1.6.8.1)
|
55
|
+
mini_portile2 (~> 2.1.0)
|
56
|
+
notiffany (0.1.1)
|
57
|
+
nenv (~> 0.1)
|
58
|
+
shellany (~> 0.0)
|
59
|
+
parser (2.3.1.4)
|
60
|
+
ast (~> 2.2)
|
61
|
+
powerpack (0.1.1)
|
62
|
+
pry (0.10.4)
|
63
|
+
coderay (~> 1.1.0)
|
64
|
+
method_source (~> 0.8.1)
|
65
|
+
slop (~> 3.4)
|
66
|
+
rainbow (2.1.0)
|
67
|
+
rake (11.3.0)
|
68
|
+
rb-fsevent (0.9.7)
|
69
|
+
rb-inotify (0.9.7)
|
70
|
+
ffi (>= 0.5.0)
|
71
|
+
rspec (3.5.0)
|
72
|
+
rspec-core (~> 3.5.0)
|
73
|
+
rspec-expectations (~> 3.5.0)
|
74
|
+
rspec-mocks (~> 3.5.0)
|
75
|
+
rspec-core (3.5.4)
|
76
|
+
rspec-support (~> 3.5.0)
|
77
|
+
rspec-expectations (3.5.0)
|
78
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
79
|
+
rspec-support (~> 3.5.0)
|
80
|
+
rspec-mocks (3.5.0)
|
81
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
82
|
+
rspec-support (~> 3.5.0)
|
83
|
+
rspec-support (3.5.0)
|
84
|
+
rspec_junit_formatter (0.2.3)
|
85
|
+
builder (< 4)
|
86
|
+
rspec-core (>= 2, < 4, != 2.12.0)
|
87
|
+
rubocop (0.44.1)
|
88
|
+
parser (>= 2.3.1.1, < 3.0)
|
89
|
+
powerpack (~> 0.1)
|
90
|
+
rainbow (>= 1.99.1, < 3.0)
|
91
|
+
ruby-progressbar (~> 1.7)
|
92
|
+
unicode-display_width (~> 1.0, >= 1.0.1)
|
93
|
+
ruby-progressbar (1.8.1)
|
94
|
+
safe_yaml (1.0.4)
|
95
|
+
shellany (0.0.1)
|
96
|
+
simplecov (0.12.0)
|
97
|
+
docile (~> 1.1.0)
|
98
|
+
json (>= 1.8, < 3)
|
99
|
+
simplecov-html (~> 0.10.0)
|
100
|
+
simplecov-html (0.10.0)
|
101
|
+
slop (3.6.0)
|
102
|
+
term-ansicolor (1.4.0)
|
103
|
+
tins (~> 1.0)
|
104
|
+
thor (0.19.1)
|
105
|
+
tins (1.12.0)
|
106
|
+
unicode-display_width (1.1.1)
|
107
|
+
webmock (2.1.0)
|
108
|
+
addressable (>= 2.3.6)
|
109
|
+
crack (>= 0.3.2)
|
110
|
+
hashdiff
|
111
|
+
|
112
|
+
PLATFORMS
|
113
|
+
ruby
|
114
|
+
|
115
|
+
DEPENDENCIES
|
116
|
+
coveralls
|
117
|
+
gem-release
|
118
|
+
guard
|
119
|
+
guard-rspec
|
120
|
+
guard-rubocop
|
121
|
+
listen (~> 3.0, < 3.1)
|
122
|
+
rake
|
123
|
+
rspec
|
124
|
+
rspec_junit_formatter
|
125
|
+
rubocop
|
126
|
+
simplecov
|
127
|
+
url_canonicalize!
|
128
|
+
webmock
|
129
|
+
|
130
|
+
BUNDLED WITH
|
131
|
+
1.13.5
|
data/Guardfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Run RuboCop when a Ruby file or a RuboCop configuration file changes
guard :rubocop do
  watch(%r{.+\.rb$})
  watch(%r{(?:.+/)?\.rubocop\.yml$}) { |match| File.dirname(match[0]) }
end

# Options handed to guard-rspec
rspec_options = {
  all_after_pass: false,
  all_on_start: false,
  cmd: 'NO_SIMPLECOV=true bundle exec rspec --fail-fast --format documentation'
}

# Run the matching spec when a spec or library file changes, and the whole
# suite when the spec helper or a support file changes
guard :rspec, rspec_options do
  watch(%r{spec/.+_spec\.rb$})
  watch(%r{lib/(.+)\.rb$}) { |match| "spec/#{match[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { 'spec' }
  watch(%r{^spec/support/.+\.rb$}) { 'spec' }
end
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2016 Xenapto
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# URLCanonicalize
|
2
|
+
|
3
|
+
URLCanonicalize is a Ruby gem that finds the canonical version of a URL. It
|
4
|
+
provides `canonicalize` methods for the String, URI::HTTP, URI::HTTPS and
|
5
|
+
Addressable::URI classes.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'url_canonicalize'
|
13
|
+
```
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
'http://www.twitter.com'.canonicalize # => 'https://twitter.com/'
|
19
|
+
URI('http://www.twitter.com').canonicalize # => #<URI::HTTP:0x00000008767908 URL:https://twitter.com/>
|
20
|
+
Addressable::URI.canonicalize('http://www.twitter.com') # => #<Addressable::URI:0x43c9 URI:https://twitter.com/>
|
21
|
+
```
|
22
|
+
|
23
|
+
## More Information
|
24
|
+
|
25
|
+
URLCanonicalize follows HTTP redirects and also looks for `rel="canonical"` hints
|
26
|
+
in both the HTTP headers and the `<head>` section of the response HTML. The URL
|
27
|
+
it returns will be both normalized and canonical. The intention is that
|
28
|
+
whatever variant of a URL is supplied the result will always be the same. The
|
29
|
+
intended use case is for applications that need to dedupe a list of URLs, for
|
30
|
+
instance to check if a new URL is already present in a list. If the list is
|
31
|
+
built from canonicalized URLs then the resulting set will have fewer URLs that
|
32
|
+
point to the same ultimate resource.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bundler/gem_tasks'
|
data/circle.yml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
machine:
|
2
|
+
ruby:
|
3
|
+
version: 2.1.9
|
4
|
+
|
5
|
+
dependencies:
|
6
|
+
pre:
|
7
|
+
- echo "export rvm_ignore_gemsets_flag=1" >> ~/.rvmrc
|
8
|
+
- gem install bundler
|
9
|
+
override:
|
10
|
+
- bundle check --path=vendor/bundle || bundle install --path=vendor/bundle --jobs=4 --retry=3 --full-index
|
11
|
+
|
12
|
+
test:
|
13
|
+
override:
|
14
|
+
- bundle exec rspec:
|
15
|
+
timeout: 600
|
16
|
+
parallel: true
|
17
|
+
files:
|
18
|
+
- spec/**/*_spec.rb
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Addressable
  # Adds a class-level canonicalize helper to Addressable's URI class
  class URI
    class << self
      # Canonicalize anything Addressable::URI.parse can handle.
      #
      # The input is parsed and turned into a string, passed to
      # URLCanonicalize.canonicalize, and the result is parsed again so the
      # caller receives whatever .parse returns.
      def canonicalize(uri)
        original_url = parse(uri).to_s
        parse(URLCanonicalize.canonicalize(original_url))
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# Patch for Ruby's String class: URL canonicalization plus ANSI colour helpers
class String
  # Treat this string as a URL and return URLCanonicalize's result for it
  def canonicalize
    URLCanonicalize.canonicalize(self)
  end

  # Wrap this string in an ANSI escape sequence built from the given
  # attribute codes, followed by the reset sequence
  def ansi_attributes(*args)
    codes = args.join(';')
    "\e[#{codes}m#{self}\e[0m"
  end

  palette = %w(black red green yellow blue magenta cyan white)

  # Define #red, #green, ... (foreground codes 30-37) and
  # #red_on_blue, ... (foreground plus background codes 40-47)
  palette.each_with_index do |foreground, fg_offset|
    define_method(foreground) { ansi_attributes(30 + fg_offset) }

    palette.each_with_index do |background, bg_offset|
      define_method("#{foreground}_on_#{background}") do
        ansi_attributes(30 + fg_offset, 40 + bg_offset)
      end
    end
  end
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module URLCanonicalize
  # Persistent connection for possible repeated requests to the same host.
  #
  # Starting from the URL given to the constructor, #fetch keeps issuing
  # requests (through URLCanonicalize::Request), following redirects and
  # canonical hints, until one of the handlers reports that we are done.
  class HTTP
    # Request the current URL until #handle_response returns a truthy value.
    #
    # Returns the response most recently recorded as known-good.
    # Raises URLCanonicalize::Exception::Redirect or
    # URLCanonicalize::Exception::Failure when the chain ends before any
    # known-good response has been recorded.
    def fetch
      loop { break last_known_good if handle_response }
    end

    # The current URL as a parsed URI (memoized until the URL changes)
    def uri
      @uri ||= URLCanonicalize::URI.parse(url) # Malformed URLs will raise a URLCanonicalize exception
    end

    # Point this object at a new URL and drop the memoized URI
    def url=(value)
      @url = value.to_s
      @uri = nil
    end

    # Send a prepared request object over the Net::HTTP object for the
    # current host and port
    def request(request_object)
      http.request request_object
    end

    private

    attr_reader :last_known_good

    def initialize(raw_url)
      @raw_url = raw_url
    end

    # Fetch the response
    def response
      @response ||= Request.new(self).fetch
    end

    # Parse the response, and clear the response ready to follow the next redirect
    def handle_response
      result = parse_response
      @response = nil
      result
    end

    # Parse the response. A truthy return value means "stop looping".
    def parse_response
      case response
      when Net::HTTPSuccess
        handle_success
      when URLCanonicalize::Response::Redirect
        redirect_loop_detected? || max_redirects_reached?
      when URLCanonicalize::Response::CanonicalFound
        handle_canonical_found
      when URLCanonicalize::Response::Failure
        handle_failure
      else
        handle_unhandled_response
      end
    end

    # If the redirect target has been seen before, stop (true) when there is
    # a known-good response to fall back on, otherwise raise. For a new
    # target, record it, count the redirect, adopt the target as the current
    # URL and return false.
    def redirect_loop_detected?
      if redirect_list.include?(response.url)
        return true if last_known_good
        raise URLCanonicalize::Exception::Redirect, 'Redirect loop detected'
      end

      redirect_list << response.url
      increment_redirects
      set_url_from_response
      false
    end

    # True when more than options[:max_redirects] redirects have been
    # followed and there is a known-good response to fall back on; raises
    # when there is none.
    def max_redirects_reached?
      # Go through the memoized reader so the counter is 0 rather than nil
      # when no redirect has been counted yet
      return false unless redirects > options[:max_redirects]
      return true if last_known_good
      raise URLCanonicalize::Exception::Redirect, "#{redirects} redirects is too many"
    end

    def redirect_list
      @redirect_list ||= []
    end

    def increment_redirects
      @redirects = redirects + 1
    end

    def redirects
      @redirects ||= 0
    end

    # Record the response that carried the canonical hint as known-good.
    # Stop if the hint points at the current URL or at a URL already in the
    # redirect list; otherwise request the hinted URL next.
    def handle_canonical_found
      @last_known_good = response.response
      return true if response.url == url || redirect_list.include?(response.url)
      set_url_from_response
      false
    end

    def set_url_from_response
      self.url = response.url
    end

    # Settle for the known-good response if there is one, otherwise raise
    def handle_failure
      return true if last_known_good
      raise URLCanonicalize::Exception::Failure, "#{response.failure_class}: #{response.message}"
    end

    def handle_unhandled_response
      raise URLCanonicalize::Exception::Failure, "Unhandled response type: #{response.class}"
    end

    def handle_success
      @last_known_good = response
      true
    end

    def url
      @url ||= @raw_url.to_s
    end

    # The Net::HTTP object for the current URL; a new one is built only when
    # the host or port differs from the previous request
    def http
      return @http if same_host_and_port # reuse connection

      @previous = uri
      @http = new_http
    end

    def same_host_and_port
      uri.host == previous.host && uri.port == previous.port
    end

    # Before the first request this is an empty placeholder whose host and
    # port are both nil, so the first comparison never matches
    def previous
      @previous ||= Struct.new(:host, :port).new
    end

    def new_http
      h = Net::HTTP.new uri.host, uri.port

      h.open_timeout = options[:open_timeout]
      h.read_timeout = options[:read_timeout]

      if uri.scheme == 'https'
        h.use_ssl = true # Can generate exception
        # NOTE(review): certificate verification is switched off here, so
        # the identity of HTTPS hosts is not checked. Kept as in the
        # original; confirm this is intentional before relying on results.
        h.verify_mode = OpenSSL::SSL::VERIFY_NONE
      else
        h.use_ssl = false
      end

      h
    end

    # Timeouts are passed to Net::HTTP's open_timeout and read_timeout
    def options
      @options ||= {
        open_timeout: 8, # Twitter responds in >5s
        read_timeout: 15,
        max_redirects: 10
      }
    end
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module URLCanonicalize
  # Make an HTTP request
  #
  # Wraps a single request for the URL currently held by the supplied
  # URLCanonicalize::HTTP object and classifies the outcome.
  class Request
    # Network-level errors that are reported as a Response::Failure instead
    # of being allowed to propagate
    NETWORK_EXCEPTIONS = [
      EOFError,
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::EHOSTUNREACH,
      Errno::EINVAL,
      Errno::ENETUNREACH,
      Errno::ETIMEDOUT,
      Net::OpenTimeout,
      Net::ReadTimeout,
      OpenSSL::SSL::SSLError,
      SocketError,
      Timeout::Error
    ].freeze

    # Matches a canonical entry in a Link response header. The URL part
    # excludes '>' so the match cannot run across several links in one header.
    LINK_CANONICAL = /<(?<url>[^>]+)>\s*;\s*rel="canonical"/i

    # Hosts for which only GET requests are used (see #check_http_method)
    GET_ONLY_HOSTS = /(linkedin|crunchbase)\.com/

    # Perform the request and classify the result.
    #
    # Returns one of:
    # - URLCanonicalize::Response::CanonicalFound when a canonical hint is present
    # - URLCanonicalize::Response::Redirect for redirects other than 302/307
    # - URLCanonicalize::Response::Failure for other statuses and network errors
    # - the Net::HTTP response itself when a GET yields no canonical hint
    def fetch
      handle_response
    end

    private

    attr_reader :http, :http_method

    # http        - the object used to send the request; must respond to
    #               #request and #uri
    # http_method - :head (default) or :get
    def initialize(http, http_method = :head)
      @http = http
      @http_method = http_method
    end

    def response
      @response ||= http.request request # Some URLs can throw an exception here
    end

    def request
      @request ||= request_for_method
    end

    def handle_response
      case response
      when Net::HTTPSuccess
        look_for_canonical
      when Net::HTTPRedirection
        handle_redirection
      else
        handle_failure
      end
    rescue *NETWORK_EXCEPTIONS => e
      handle_failure(e.class, e.message)
    end

    # Look for a canonical hint: first in the Link response header, then
    # (after switching from HEAD to GET if necessary) in the HTML <head>
    def look_for_canonical
      linked_url = canonical_from_link_header

      if linked_url
        URLCanonicalize::Response::CanonicalFound.new(linked_url, response)
      elsif http_method == :head
        self.http_method = :get
        fetch
      else
        canonical_url ? URLCanonicalize::Response::CanonicalFound.new(canonical_url, response) : response
      end
    end

    # The canonical URL from the Link response header, or nil when the
    # header is missing or has no canonical entry
    def canonical_from_link_header
      match = LINK_CANONICAL.match(response['link'].to_s)
      absolute_url(match[:url]) if match
    end

    # 302 and 307 responses are re-requested with GET and searched for a
    # canonical hint; every other redirect is reported with its Location
    def handle_redirection
      case response
      when Net::HTTPFound, Net::HTTPMovedTemporarily, Net::HTTPTemporaryRedirect
        self.http_method = :get
        look_for_canonical
      else
        URLCanonicalize::Response::Redirect.new(response['location'])
      end
    end

    def handle_failure(klass = response.class, message = response.message)
      URLCanonicalize::Response::Failure.new(klass, message)
    end

    def html
      @html ||= Nokogiri::HTML response.body
    end

    def canonical_url_element
      @canonical_url_element ||= html.xpath('//head/link[@rel="canonical"]').first
    end

    # The href of <link rel="canonical"> in the response HTML, or nil
    def canonical_url
      return @canonical_url if @canonical_url

      # Call the memoizing reader: the instance variable is still unset the
      # first time this method runs
      element = canonical_url_element
      @canonical_url = absolute_url(element['href']) if element.is_a?(Nokogiri::XML::Element)
    end

    # Resolve a possibly relative reference against the URL that was
    # requested. Returns nil for blank or unparseable references.
    def absolute_url(candidate)
      candidate = candidate.to_s.strip
      return if candidate.empty?
      ::URI.join(url, candidate).to_s
    rescue ::URI::Error
      nil
    end

    def uri
      @uri ||= http.uri
    end

    def url
      @url ||= uri.to_s
    end

    def host
      @host ||= uri.host
    end

    # Build the request object and apply the default headers
    def request_for_method
      r = base_request
      headers.each { |header_key, header_value| r[header_key] = header_value }
      r
    end

    # Raises URLCanonicalize::Exception::Request for any method other than
    # :head or :get
    def base_request
      check_http_method

      case http_method
      when :head
        Net::HTTP::Head.new uri
      when :get
        Net::HTTP::Get.new uri
      else
        raise URLCanonicalize::Exception::Request, "Unknown method: #{http_method}"
      end
    end

    def headers
      @headers ||= {
        'Accept-Language' => 'en-US,en;q=0.8',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; WOW64) '\
                        'AppleWebKit/537.36 (KHTML, like Gecko) '\
                        'Chrome/51.0.2704.103 Safari/537.36'
      }
    end

    # Changing the method discards the memoized request and response so the
    # next access performs a fresh request
    def http_method=(value)
      @http_method = value
      @request = nil
      @response = nil
    end

    # Some sites treat HEAD requests as suspicious activity and block the
    # requester after a few attempts. For these sites we'll use GET requests
    # only
    def check_http_method
      @http_method = :get if host =~ GET_ONLY_HOSTS
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module URLCanonicalize
  # The response from an HTTP request
  module Response
    # Common shape for outcomes that carry a URL
    class Generic
      attr_reader :url

      private

      def initialize(url)
        @url = url
      end
    end

    # A redirect; #url is the redirect target
    class Redirect < Generic; end

    # A canonical hint was found; #url is the hinted URL and #response is
    # the response that carried the hint
    class CanonicalFound < Generic
      attr_reader :response

      private

      def initialize(url, response)
        super(url)
        @response = response
      end
    end

    # It barfed
    class Failure
      attr_reader :failure_class, :message

      private

      def initialize(failure_class, message)
        @failure_class, @message = failure_class, message
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module URLCanonicalize
  # Manage the URL into a URI with local exception handling
  class URI
    class << self
      # Parse url with Ruby's URI library and return the parsed URI.
      #
      # Raises URLCanonicalize::Exception::URI when the URL cannot be
      # parsed, is not http or https, or has no host.
      def parse(url)
        parsed = ::URI.parse(url)
        parsed if valid?(parsed)
      rescue ::URI::InvalidURIError => e
        # Re-raise as the gem's own exception, keeping the original backtrace
        raise URLCanonicalize::Exception::URI, "#{e.class}: #{e.message}", e.backtrace
      end

      private

      # Returns true for an http/https URI with a host, raises otherwise
      def valid?(uri)
        VALID_CLASSES.include?(uri.class) ||
          raise(URLCanonicalize::Exception::URI, "#{uri} must be http or https")
        uri.host || raise(URLCanonicalize::Exception::URI, "Missing host name in #{uri}")
        true
      end

      VALID_CLASSES = [::URI::HTTP, ::URI::HTTPS].freeze
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'uri'
|
2
|
+
require 'addressable/uri'
|
3
|
+
require 'net/http'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
autoload :OpenSSL, 'openssl'
|
7
|
+
|
8
|
+
# Core methods
|
9
|
+
# Core methods
module URLCanonicalize
  # The gem's own classes are loaded on first use
  autoload :Exception, 'url_canonicalize/exception'
  autoload :HTTP, 'url_canonicalize/http'
  autoload :Request, 'url_canonicalize/request'
  autoload :Response, 'url_canonicalize/response'
  autoload :URI, 'url_canonicalize/uri'
  autoload :VERSION, 'url_canonicalize/version'

  # Return the canonical form of url as a string, taken from the #uri of
  # the response that .fetch returns
  def self.canonicalize(url)
    final_response = fetch(url)
    final_response.uri.to_s
  end

  # Return whatever URLCanonicalize::HTTP#fetch yields for url
  def self.fetch(url)
    URLCanonicalize::HTTP.new(url).fetch
  end
end
|
27
|
+
|
28
|
+
require 'monkey_patches/uri'
|
29
|
+
require 'monkey_patches/string'
|
30
|
+
require 'monkey_patches/addressable/uri'
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Make lib/ loadable so the version constant can be read below
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'url_canonicalize/version'

Gem::Specification.new do |s|
  s.name = 'url_canonicalize'
  s.version = URLCanonicalize::VERSION
  s.authors = ['Dominic Sayers']
  s.email = ['developers@xenapto.com']
  s.summary = 'Finds the canonical version of a URL'
  s.description = 'Rubygem that finds the canonical version of a URL by '\
                  'providing #canonicalize methods for the String, URI::HTTP'\
                  ', URI::HTTPS and Addressable::URI classes'
  s.homepage = 'https://github.com/Xenapto/url_canonicalize'
  s.license = 'MIT'

  # Package every tracked file except the specs. The list is NUL-delimited
  # so it does not depend on $RS (only defined once the English library is
  # loaded) and file names containing whitespace stay intact.
  s.files = `git ls-files -z`.split("\x0").reject do |file|
    file.start_with?('spec/')
  end

  s.test_files = []
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
  s.require_paths = ['lib']

  s.add_dependency 'addressable', '~> 2' # To normalize URLs
  s.add_dependency 'nokogiri', '~> 1' # To look for <link rel="canonical" ...> in HTML
end
|
metadata
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: url_canonicalize
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Dominic Sayers
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-10-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1'
|
41
|
+
description: 'Rubygem that finds the canonical version of a URL by providing #canonicalize
|
42
|
+
methods for the String, URI::HTTP, URI::HTTPS and Addressable::URI classes'
|
43
|
+
email:
|
44
|
+
- developers@xenapto.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- ".gitignore"
|
50
|
+
- ".hound.yml"
|
51
|
+
- ".rspec"
|
52
|
+
- ".rubocop.yml"
|
53
|
+
- ".ruby-gemset"
|
54
|
+
- ".ruby-version"
|
55
|
+
- Gemfile
|
56
|
+
- Gemfile.local.example
|
57
|
+
- Gemfile.lock
|
58
|
+
- Guardfile
|
59
|
+
- LICENSE
|
60
|
+
- README.md
|
61
|
+
- Rakefile
|
62
|
+
- circle.yml
|
63
|
+
- lib/monkey_patches/addressable/uri.rb
|
64
|
+
- lib/monkey_patches/string.rb
|
65
|
+
- lib/monkey_patches/uri.rb
|
66
|
+
- lib/url_canonicalize.rb
|
67
|
+
- lib/url_canonicalize/exception.rb
|
68
|
+
- lib/url_canonicalize/http.rb
|
69
|
+
- lib/url_canonicalize/request.rb
|
70
|
+
- lib/url_canonicalize/response.rb
|
71
|
+
- lib/url_canonicalize/uri.rb
|
72
|
+
- lib/url_canonicalize/version.rb
|
73
|
+
- url_canonicalize.gemspec
|
74
|
+
homepage: https://github.com/Xenapto/url_canonicalize
|
75
|
+
licenses:
|
76
|
+
- MIT
|
77
|
+
metadata: {}
|
78
|
+
post_install_message:
|
79
|
+
rdoc_options: []
|
80
|
+
require_paths:
|
81
|
+
- lib
|
82
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - ">="
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: '0'
|
92
|
+
requirements: []
|
93
|
+
rubyforge_project:
|
94
|
+
rubygems_version: 2.6.7
|
95
|
+
signing_key:
|
96
|
+
specification_version: 4
|
97
|
+
summary: Finds the canonical version of a URL
|
98
|
+
test_files: []
|