url_canonicalize 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3b018935dfe00ea2df8f9265b439a846141a8b03
4
+ data.tar.gz: 0e103350c0c91b1ae261dd2996b112ef35aa6790
5
+ SHA512:
6
+ metadata.gz: e30f6a713c580cb396ccf9b717a32b8b56ff61f8aa8b90ab9fff6981383e7d78a90f4c343ae6cd2cb492aea745601c5c195b2cfd4849d16afe8a01b24d9b5672
7
+ data.tar.gz: 068cf183a3d2050de607f5d098cdf37defe6dc68ec3210f67c3d8fce045e63d4129bc79b870f42a25755594572558a9e70272aff421a476eb84c76e851784979
data/.gitignore ADDED
@@ -0,0 +1,51 @@
1
+ *.local
2
+ *.gem
3
+ *.rbc
4
+ /.config
5
+ /coverage/
6
+ /InstalledFiles
7
+ /pkg/
8
+ /spec/reports/
9
+ /spec/examples.txt
10
+ /test/tmp/
11
+ /test/version_tmp/
12
+ /tmp/
13
+
14
+ # Used by dotenv library to load environment variables.
15
+ # .env
16
+
17
+ ## Specific to RubyMotion:
18
+ .dat*
19
+ .repl_history
20
+ build/
21
+ *.bridgesupport
22
+ build-iPhoneOS/
23
+ build-iPhoneSimulator/
24
+
25
+ ## Specific to RubyMotion (use of CocoaPods):
26
+ #
27
+ # We recommend against adding the Pods directory to your .gitignore. However
28
+ # you should judge for yourself, the pros and cons are mentioned at:
29
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
30
+ #
31
+ # vendor/Pods/
32
+
33
+ ## Documentation cache and generated files:
34
+ /.yardoc/
35
+ /_yardoc/
36
+ /doc/
37
+ /rdoc/
38
+
39
+ ## Environment normalization:
40
+ /.bundle/
41
+ /vendor/bundle
42
+ /lib/bundler/man/
43
+
44
+ # for a library or gem, you might want to ignore these files since the code is
45
+ # intended to run in multiple environments; otherwise, check them in:
46
+ # Gemfile.lock
47
+ # .ruby-version
48
+ # .ruby-gemset
49
+
50
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
51
+ .rvmrc
data/.hound.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ ruby:
3
+ enabled: true
4
+ config_file: .rubocop.yml
data/.rspec ADDED
@@ -0,0 +1,5 @@
1
+ --color
2
+ --require spec_helper
3
+ --profile
4
+ --format progress
5
+ --format RspecJunitFormatter --out <%= ENV.fetch('CIRCLE_TEST_REPORTS', 'tmp') %>/rspec/rspec.xml
data/.rubocop.yml ADDED
@@ -0,0 +1,55 @@
1
+ ---
2
+ AllCops:
3
+ Exclude:
4
+ - '*.gemspec'
5
+
6
+ Style/TrailingCommaInArguments:
7
+ EnforcedStyleForMultiline: no_comma
8
+ Enabled: true
9
+
10
+ StringLiterals:
11
+ EnforcedStyle: single_quotes
12
+ Enabled: true
13
+
14
+ LineLength:
15
+ Max: 120
16
+ Exclude:
17
+ - 'spec/**/*'
18
+
19
+ MethodLength:
20
+ Max: 12
21
+
22
+ DotPosition:
23
+ Description: 'Checks the position of the dot in multi-line method calls.'
24
+ EnforcedStyle: leading
25
+ Enabled: true
26
+
27
+ ClassAndModuleChildren:
28
+ Description: 'Checks style of children classes and modules.'
29
+ EnforcedStyle: nested
30
+ Enabled: true
31
+
32
+ Documentation:
33
+ Description: 'Document classes and non-namespace modules.'
34
+ Enabled: false
35
+
36
+ FileName:
37
+ Description: 'Use snake_case for source file names.'
38
+ Enabled: true
39
+
40
+ Style/SymbolArray:
41
+ Description: 'Use %i or %I for arrays of symbols.'
42
+ StyleGuide: 'https://github.com/bbatsov/ruby-style-guide#percent-i'
43
+ Enabled: false # Only available in Ruby 2.0+
44
+
45
+ Style/ExtraSpacing:
46
+ Description: 'Do not use unnecessary spacing.'
47
+ Enabled: true
48
+
49
+ Lint/LiteralInInterpolation:
50
+ Description: 'Avoid interpolating literals in strings'
51
+ AutoCorrect: true
52
+
53
+ Metrics/ClassLength:
54
+ CountComments: false # count full line comments?
55
+ Max: 120
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ url_canonicalize
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.9
data/Gemfile ADDED
@@ -0,0 +1,17 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ group :test do
6
+ gem 'rspec'
7
+ gem 'rspec_junit_formatter'
8
+ gem 'webmock'
9
+ gem 'simplecov'
10
+ gem 'coveralls', require: false
11
+ end
12
+
13
+ local_gemfile = 'Gemfile.local'
14
+
15
+ if File.exist?(local_gemfile)
16
+ eval(File.read(local_gemfile)) # rubocop:disable Lint/Eval
17
+ end
@@ -0,0 +1,11 @@
1
+ # Copy this file to a file named Gemfile.local to add these gems to your dev toolset
2
+ # Feel free to modify Gemfile.local to suit your own preferences
3
+ group :development do
4
+ gem 'rake'
5
+ gem 'gem-release'
6
+ gem 'rubocop'
7
+ gem 'listen', '~> 3.0', '< 3.1' # Dependency of guard, 3.1 requires Ruby 2.2+
8
+ gem 'guard'
9
+ gem 'guard-rspec'
10
+ gem 'guard-rubocop'
11
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,131 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ url_canonicalize (0.0.1)
5
+ addressable (~> 2)
6
+ nokogiri (~> 1)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ addressable (2.4.0)
12
+ ast (2.3.0)
13
+ builder (3.2.2)
14
+ coderay (1.1.1)
15
+ coveralls (0.8.15)
16
+ json (>= 1.8, < 3)
17
+ simplecov (~> 0.12.0)
18
+ term-ansicolor (~> 1.3)
19
+ thor (~> 0.19.1)
20
+ tins (>= 1.6.0, < 2)
21
+ crack (0.4.3)
22
+ safe_yaml (~> 1.0.0)
23
+ diff-lcs (1.2.5)
24
+ docile (1.1.5)
25
+ ffi (1.9.14)
26
+ formatador (0.2.5)
27
+ gem-release (0.7.4)
28
+ guard (2.14.0)
29
+ formatador (>= 0.2.4)
30
+ listen (>= 2.7, < 4.0)
31
+ lumberjack (~> 1.0)
32
+ nenv (~> 0.1)
33
+ notiffany (~> 0.0)
34
+ pry (>= 0.9.12)
35
+ shellany (~> 0.0)
36
+ thor (>= 0.18.1)
37
+ guard-compat (1.2.1)
38
+ guard-rspec (4.7.3)
39
+ guard (~> 2.1)
40
+ guard-compat (~> 1.1)
41
+ rspec (>= 2.99.0, < 4.0)
42
+ guard-rubocop (1.2.0)
43
+ guard (~> 2.0)
44
+ rubocop (~> 0.20)
45
+ hashdiff (0.3.0)
46
+ json (2.0.2)
47
+ listen (3.0.8)
48
+ rb-fsevent (~> 0.9, >= 0.9.4)
49
+ rb-inotify (~> 0.9, >= 0.9.7)
50
+ lumberjack (1.0.10)
51
+ method_source (0.8.2)
52
+ mini_portile2 (2.1.0)
53
+ nenv (0.3.0)
54
+ nokogiri (1.6.8.1)
55
+ mini_portile2 (~> 2.1.0)
56
+ notiffany (0.1.1)
57
+ nenv (~> 0.1)
58
+ shellany (~> 0.0)
59
+ parser (2.3.1.4)
60
+ ast (~> 2.2)
61
+ powerpack (0.1.1)
62
+ pry (0.10.4)
63
+ coderay (~> 1.1.0)
64
+ method_source (~> 0.8.1)
65
+ slop (~> 3.4)
66
+ rainbow (2.1.0)
67
+ rake (11.3.0)
68
+ rb-fsevent (0.9.7)
69
+ rb-inotify (0.9.7)
70
+ ffi (>= 0.5.0)
71
+ rspec (3.5.0)
72
+ rspec-core (~> 3.5.0)
73
+ rspec-expectations (~> 3.5.0)
74
+ rspec-mocks (~> 3.5.0)
75
+ rspec-core (3.5.4)
76
+ rspec-support (~> 3.5.0)
77
+ rspec-expectations (3.5.0)
78
+ diff-lcs (>= 1.2.0, < 2.0)
79
+ rspec-support (~> 3.5.0)
80
+ rspec-mocks (3.5.0)
81
+ diff-lcs (>= 1.2.0, < 2.0)
82
+ rspec-support (~> 3.5.0)
83
+ rspec-support (3.5.0)
84
+ rspec_junit_formatter (0.2.3)
85
+ builder (< 4)
86
+ rspec-core (>= 2, < 4, != 2.12.0)
87
+ rubocop (0.44.1)
88
+ parser (>= 2.3.1.1, < 3.0)
89
+ powerpack (~> 0.1)
90
+ rainbow (>= 1.99.1, < 3.0)
91
+ ruby-progressbar (~> 1.7)
92
+ unicode-display_width (~> 1.0, >= 1.0.1)
93
+ ruby-progressbar (1.8.1)
94
+ safe_yaml (1.0.4)
95
+ shellany (0.0.1)
96
+ simplecov (0.12.0)
97
+ docile (~> 1.1.0)
98
+ json (>= 1.8, < 3)
99
+ simplecov-html (~> 0.10.0)
100
+ simplecov-html (0.10.0)
101
+ slop (3.6.0)
102
+ term-ansicolor (1.4.0)
103
+ tins (~> 1.0)
104
+ thor (0.19.1)
105
+ tins (1.12.0)
106
+ unicode-display_width (1.1.1)
107
+ webmock (2.1.0)
108
+ addressable (>= 2.3.6)
109
+ crack (>= 0.3.2)
110
+ hashdiff
111
+
112
+ PLATFORMS
113
+ ruby
114
+
115
+ DEPENDENCIES
116
+ coveralls
117
+ gem-release
118
+ guard
119
+ guard-rspec
120
+ guard-rubocop
121
+ listen (~> 3.0, < 3.1)
122
+ rake
123
+ rspec
124
+ rspec_junit_formatter
125
+ rubocop
126
+ simplecov
127
+ url_canonicalize!
128
+ webmock
129
+
130
+ BUNDLED WITH
131
+ 1.13.5
data/Guardfile ADDED
@@ -0,0 +1,16 @@
1
+ guard :rubocop do
2
+ watch(/.+\.rb$/)
3
+ watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
4
+ end
5
+
6
+ guard(
7
+ :rspec,
8
+ all_after_pass: false,
9
+ all_on_start: false,
10
+ cmd: 'NO_SIMPLECOV=true bundle exec rspec --fail-fast --format documentation'
11
+ ) do
12
+ watch(%r{spec/.+_spec\.rb$})
13
+ watch(%r{lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
14
+ watch('spec/spec_helper.rb') { 'spec' }
15
+ watch(%r{^spec/support/.+\.rb$}) { 'spec' }
16
+ end
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016 Xenapto
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,32 @@
1
+ # URLCanonicalize
2
+
3
+ URLCanonicalize is a Ruby gem that finds the canonical version of a URL. It
4
+ provides `canonicalize` methods for the String, URI::HTTP, URI::HTTPS and
5
+ Addressable::URI classes.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'url_canonicalize'
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ 'http://www.twitter.com'.canonicalize # => 'https://twitter.com/'
19
+ URI('http://www.twitter.com').canonicalize # => #<URI::HTTPS:0x00000008767908 URL:https://twitter.com/>
20
+ Addressable::URI.canonicalize('http://www.twitter.com') # => #<Addressable::URI:0x43c9 URI:https://twitter.com/>
21
+ ```
22
+
23
+ ## More Information
24
+
25
+ URLCanonicalize follows HTTP redirects and also looks for `rel="canonical"` hints
26
+ in both the HTTP headers and the `<head>` section of the response HTML. The URL
27
+ it returns will be both normalized and canonical. The intention is that
28
+ whatever variant of a URL is supplied the result will always be the same. The
29
+ intended use case is for applications that need to dedupe a list of URLs, for
30
+ instance to check if a new URL is already present in a list. If the list is
31
+ built from canonicalized URLs then the resulting set will have fewer URLs that
32
+ point to the same ultimate resource.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
data/circle.yml ADDED
@@ -0,0 +1,18 @@
1
+ machine:
2
+ ruby:
3
+ version: 2.1.9
4
+
5
+ dependencies:
6
+ pre:
7
+ - echo "export rvm_ignore_gemsets_flag=1" >> ~/.rvmrc
8
+ - gem install bundler
9
+ override:
10
+ - bundle check --path=vendor/bundle || bundle install --path=vendor/bundle --jobs=4 --retry=3 --full-index
11
+
12
+ test:
13
+ override:
14
+ - bundle exec rspec:
15
+ timeout: 600
16
+ parallel: true
17
+ files:
18
+ - spec/**/*_spec.rb
@@ -0,0 +1,10 @@
1
module Addressable
  # Adds a class-level canonicalize helper to Addressable's URI class
  class URI
    class << self
      # Parse the argument (anything Addressable::URI.parse accepts), ask
      # URLCanonicalize for the canonical form of its string representation,
      # and return that canonical URL parsed again.
      def canonicalize(uri)
        parse(URLCanonicalize.canonicalize(parse(uri).to_s))
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
# Extensions to Ruby's String class: URL canonicalization plus ANSI colour helpers
class String
  # Treat the string as a URL and return what URLCanonicalize.canonicalize gives for it
  def canonicalize
    URLCanonicalize.canonicalize self
  end

  # Wrap the string in an ANSI escape sequence built from the given attribute
  # codes (joined with ';') and terminate it with the reset sequence
  def ansi_attributes(*args)
    "\e[#{args.join(';')}m#{self}\e[0m"
  end

  # Colour names paired with their offset from the ANSI base codes
  # (30 for foreground, 40 for background)
  palette = %w(black red green yellow blue magenta cyan white).each_with_index.to_a

  palette.each do |fg_name, fg_offset|
    fg_code = 30 + fg_offset

    # e.g. 'text'.red
    define_method(fg_name) { ansi_attributes(fg_code) }

    palette.each do |bg_name, bg_offset|
      bg_code = 40 + bg_offset

      # e.g. 'text'.white_on_blue
      define_method("#{fg_name}_on_#{bg_name}") { ansi_attributes(fg_code, bg_code) }
    end
  end
end
@@ -0,0 +1,9 @@
1
module URI
  # Adds canonicalization to URIs that use the HTTP protocol
  class HTTP
    # Canonicalize the string form of this URI and return the result parsed
    # into a fresh URI object
    def canonicalize
      ::URI.parse(URLCanonicalize.canonicalize(to_s))
    end
  end
end
@@ -0,0 +1,9 @@
1
# Core methods
module URLCanonicalize
  # Local exception classes to make handling exceptions easier. Every class
  # below inherits from URLCanonicalize::Exception, so callers can rescue the
  # whole family with a single rescue clause.
  class Exception < RuntimeError
    # The URL could not be parsed, is not http/https, or has no host
    URI = Class.new(self)
    # A request could not be built (e.g. unknown HTTP method). This class is
    # raised by URLCanonicalize::Request and was previously undefined.
    Request = Class.new(self)
    # Redirect loop detected or too many redirects followed
    Redirect = Class.new(self)
    # Network failure or an unhandled kind of response
    Failure = Class.new(self)
  end
end
@@ -0,0 +1,155 @@
1
module URLCanonicalize
  # Persistent connection for possible repeated requests to the same host.
  #
  # Starting from the URL given to the constructor, repeatedly asks
  # URLCanonicalize::Request for a response, following redirects and
  # rel="canonical" hints until a final response is reached.
  class HTTP
    # Matches URLs that already carry a scheme and authority marker
    ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.-]*://}i

    # Keep handling responses until one of them reports that we are done.
    #
    # Returns the last known good response.
    # Raises URLCanonicalize::Exception::Redirect on a redirect loop or too
    # many redirects, and URLCanonicalize::Exception::Failure on a failed or
    # unhandled response, when no good response has been seen yet.
    def fetch
      loop { break last_known_good if handle_response }
    end

    # The current URL parsed into a URI (memoized until the URL changes)
    def uri
      @uri ||= URLCanonicalize::URI.parse(url) # Malformed URLs will raise a URLCanonicalize exception
    end

    # Point this object at a new URL and discard the memoized URI
    def url=(value)
      @url = value.to_s
      @uri = nil
    end

    # Send a prepared Net::HTTP request object over the (possibly reused) connection
    def request(request_object)
      http.request request_object
    end

    private

    attr_reader :last_known_good

    def initialize(raw_url)
      @raw_url = raw_url
    end

    # Fetch the response
    def response
      @response ||= Request.new(self).fetch
    end

    # Parse the response, and clear the response ready to follow the next redirect
    def handle_response
      result = parse_response
      @response = nil
      result
    end

    # Parse the response. Returns true when fetching is complete, false when
    # another request is needed.
    def parse_response
      case response
      when Net::HTTPSuccess
        handle_success
      when URLCanonicalize::Response::Redirect
        redirect_loop_detected? || max_redirects_reached?
      when URLCanonicalize::Response::CanonicalFound
        handle_canonical_found
      when URLCanonicalize::Response::Failure
        handle_failure
      else
        handle_unhandled_response
      end
    end

    # Records the redirect target and moves to it. Returns true (stop) when
    # the target was already visited and a good response exists; raises when
    # it was already visited and there is nothing good to fall back on.
    def redirect_loop_detected?
      target = absolute_url(response.url)

      if redirect_list.include?(target)
        return true if last_known_good
        raise URLCanonicalize::Exception::Redirect, 'Redirect loop detected'
      end

      redirect_list << target
      increment_redirects
      self.url = target
      false
    end

    def max_redirects_reached?
      return false unless redirects > options[:max_redirects]
      return true if last_known_good
      raise URLCanonicalize::Exception::Redirect, "#{redirects} redirects is too many"
    end

    def redirect_list
      @redirect_list ||= []
    end

    def increment_redirects
      @redirects = redirects + 1
    end

    def redirects
      @redirects ||= 0
    end

    # Remember the response that carried the canonical hint. We are done when
    # the hint points at the current URL or at a URL already visited through
    # a redirect; otherwise move to the canonical URL and fetch again.
    def handle_canonical_found
      @last_known_good = response.response
      target = absolute_url(response.url)
      return true if target == url || redirect_list.include?(target)
      self.url = target
      false
    end

    def set_url_from_response
      self.url = absolute_url(response.url)
    end

    # Resolve a relative reference (e.g. a Location header of "/path")
    # against the current URL. Values that are empty, already absolute, or
    # that cannot be joined are returned unchanged.
    def absolute_url(location)
      candidate = location.to_s
      return location if candidate.empty? || candidate =~ ABSOLUTE_URL
      ::URI.join(url, candidate).to_s
    rescue ::URI::Error
      location
    end

    def handle_failure
      return true if last_known_good
      raise URLCanonicalize::Exception::Failure, "#{response.failure_class}: #{response.message}"
    end

    def handle_unhandled_response
      raise URLCanonicalize::Exception::Failure, "Unhandled response type: #{response.class}"
    end

    def handle_success
      @last_known_good = response
      true
    end

    def url
      @url ||= @raw_url.to_s
    end

    def http
      return @http if same_host_and_port # reuse connection

      @previous = uri
      @http = new_http
    end

    # The connection can only be reused when scheme, host and port all match:
    # a connection opened without SSL must not be reused for an https URL.
    def same_host_and_port
      uri.scheme == previous.scheme && uri.host == previous.host && uri.port == previous.port
    end

    # Placeholder with nil fields until the first connection has been made
    def previous
      @previous ||= Struct.new(:scheme, :host, :port).new
    end

    def new_http
      h = Net::HTTP.new uri.host, uri.port

      h.open_timeout = options[:open_timeout]
      h.read_timeout = options[:read_timeout]

      if uri.scheme == 'https'
        h.use_ssl = true # Can generate exception
        # NOTE(review): certificate verification is disabled here, so the
        # identity of the remote host is not checked. Confirm this is intended.
        h.verify_mode = OpenSSL::SSL::VERIFY_NONE
      else
        h.use_ssl = false
      end

      h
    end

    def options
      @options ||= {
        open_timeout: 8, # Twitter responds in >5s
        read_timeout: 15,
        max_redirects: 10
      }
    end
  end
end
@@ -0,0 +1,144 @@
1
module URLCanonicalize
  # Make an HTTP request for the URL held by the given URLCanonicalize::HTTP
  # object and classify the outcome: the raw response, a Response::Redirect,
  # a Response::CanonicalFound or a Response::Failure.
  class Request
    # A rel="canonical" entry in a Link response header. The URL capture stops
    # at the first '>' so that other links in the same header are not swallowed.
    LINK_CANONICAL = /<(?<url>[^>]+)>\s*;\s*rel="canonical"/i

    # URLs that already carry a scheme and authority marker
    ABSOLUTE_URL = %r{\A[a-z][a-z0-9+.-]*://}i

    # Perform the request (HEAD first by default, then GET if needed) and
    # return the classified result. Network errors listed in
    # NETWORK_EXCEPTIONS are returned as Response::Failure, not raised.
    def fetch
      handle_response
    end

    private

    attr_reader :http, :http_method

    def initialize(http, http_method = :head)
      @http = http
      @http_method = http_method
    end

    def response
      @response ||= http.request request # Some URLs can throw an exception here
    end

    def request
      @request ||= request_for_method
    end

    def handle_response
      case response
      when Net::HTTPSuccess
        look_for_canonical
      when Net::HTTPRedirection
        handle_redirection
      else
        handle_failure
      end
    rescue *NETWORK_EXCEPTIONS => e
      handle_failure(e.class, e.message)
    end

    # Look for a canonical URL, first in the Link header, then (after
    # upgrading a HEAD request to GET) in the HTML of the response body.
    # Returns the plain response when no canonical URL is found.
    def look_for_canonical
      header_url = link_header_canonical

      if header_url
        canonical_found(header_url)
      elsif http_method == :head
        self.http_method = :get
        fetch
      else
        canonical_url ? canonical_found(canonical_url) : response
      end
    end

    # The canonical URL from the Link response header, or nil
    def link_header_canonical
      match = LINK_CANONICAL.match(response['link'].to_s)
      match && match[:url]
    end

    # Always carry the response along: the caller keeps it as its last known
    # good response.
    def canonical_found(found_url)
      URLCanonicalize::Response::CanonicalFound.new(absolute_url(found_url), response)
    end

    # NOTE(review): for temporary redirects without a canonical hint this
    # returns the redirect response itself; confirm the caller is expected to
    # handle that.
    def handle_redirection
      case response
      when Net::HTTPFound, Net::HTTPTemporaryRedirect # HTTPMovedTemporarily is an alias of HTTPFound
        self.http_method = :get
        look_for_canonical
      else
        URLCanonicalize::Response::Redirect.new(absolute_url(response['location']))
      end
    end

    def handle_failure(klass = response.class, message = response.message)
      URLCanonicalize::Response::Failure.new(klass, message)
    end

    # Resolve a relative reference against the requested URL. Values that are
    # empty, already absolute, or that cannot be joined are returned unchanged.
    def absolute_url(location)
      candidate = location.to_s
      return location if candidate.empty? || candidate =~ ABSOLUTE_URL
      ::URI.join(url, candidate).to_s
    rescue ::URI::Error
      location
    end

    def html
      @html ||= Nokogiri::HTML(response.body)
    end

    def canonical_url_element
      @canonical_url_element ||= html.xpath('//head/link[@rel="canonical"]').first
    end

    # The href of the <link rel="canonical"> element, or nil when the page
    # has none. Goes through the memoizing reader so the element is actually
    # looked up.
    def canonical_url
      @canonical_url ||= begin
        element = canonical_url_element
        element && element['href']
      end
    end

    def uri
      @uri ||= http.uri
    end

    def url
      @url ||= uri.to_s
    end

    def host
      @host ||= uri.host
    end

    def request_for_method
      r = base_request
      headers.each { |header_key, header_value| r[header_key] = header_value }
      r
    end

    def base_request
      check_http_method

      case http_method
      when :head
        Net::HTTP::Head.new uri
      when :get
        Net::HTTP::Get.new uri
      else
        raise URLCanonicalize::Exception::Request, "Unknown method: #{http_method}"
      end
    end

    def headers
      @headers ||= {
        'Accept-Language' => 'en-US,en;q=0.8',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; WOW64) '\
                        'AppleWebKit/537.36 (KHTML, like Gecko) '\
                        'Chrome/51.0.2704.103 Safari/537.36'
      }
    end

    # Changing the method invalidates everything derived from the previous
    # request, including anything parsed from its body.
    def http_method=(value)
      @http_method = value
      @request = nil
      @response = nil
      @html = nil
      @canonical_url_element = nil
      @canonical_url = nil
    end

    # Some sites treat HEAD requests as suspicious activity and block the
    # requester after a few attempts. For these sites we'll use GET requests
    # only
    def check_http_method
      @http_method = :get if host =~ /(linkedin|crunchbase)\.com/
    end

    NETWORK_EXCEPTIONS = [
      EOFError,
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::EHOSTUNREACH,
      Errno::EINVAL,
      Errno::ENETUNREACH,
      Errno::ETIMEDOUT,
      Net::OpenTimeout,
      Net::ReadTimeout,
      OpenSSL::SSL::SSLError,
      SocketError,
      Timeout::Error
    ].freeze
  end
end
@@ -0,0 +1,39 @@
1
+ module URLCanonicalize
2
+ # The response from an HTTP request
3
+ module Response
4
+ class Generic
5
+ attr_reader :url
6
+
7
+ private
8
+
9
+ def initialize(url)
10
+ @url = url
11
+ end
12
+ end
13
+
14
+ Redirect = Class.new(Generic)
15
+
16
+ class CanonicalFound < Generic
17
+ attr_reader :response
18
+
19
+ private
20
+
21
+ def initialize(url, response)
22
+ @url = url
23
+ @response = response
24
+ end
25
+ end
26
+
27
+ # It barfed
28
+ class Failure
29
+ attr_reader :failure_class, :message
30
+
31
+ private
32
+
33
+ def initialize(failure_class, message)
34
+ @failure_class = failure_class
35
+ @message = message
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,25 @@
1
+ module URLCanonicalize
2
+ # Manage the URL into a URI with local exception handling
3
+ class URI
4
+ class << self
5
+ def parse(url)
6
+ uri = ::URI.parse(url)
7
+ uri if valid?(uri)
8
+ rescue ::URI::InvalidURIError => e
9
+ new_exception = URLCanonicalize::Exception::URI.new("#{e.class}: #{e.message}")
10
+ new_exception.set_backtrace e.backtrace
11
+ raise new_exception
12
+ end
13
+
14
+ private
15
+
16
+ def valid?(uri)
17
+ raise URLCanonicalize::Exception::URI, "#{uri} must be http or https" unless VALID_CLASSES.include?(uri.class)
18
+ raise URLCanonicalize::Exception::URI, "Missing host name in #{uri}" unless uri.host
19
+ true
20
+ end
21
+
22
+ VALID_CLASSES = [::URI::HTTP, ::URI::HTTPS].freeze
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,3 @@
1
# Version information for the url_canonicalize gem
module URLCanonicalize
  # Release number; the gemspec reads this constant for its version field
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,30 @@
1
+ require 'uri'
2
+ require 'addressable/uri'
3
+ require 'net/http'
4
+ require 'nokogiri'
5
+
6
+ autoload :OpenSSL, 'openssl'
7
+
8
# Top-level namespace of the gem and its public entry points
module URLCanonicalize
  # Constants loaded lazily on first reference, with the file that defines each
  {
    Exception: 'url_canonicalize/exception',
    HTTP: 'url_canonicalize/http',
    Request: 'url_canonicalize/request',
    Response: 'url_canonicalize/response',
    URI: 'url_canonicalize/uri',
    VERSION: 'url_canonicalize/version'
  }.each { |constant_name, path| autoload constant_name, path }

  # Returns the canonical URL for +url+ as a String: the string form of the
  # +uri+ reported by the fetched response
  def self.canonicalize(url)
    final_response = fetch(url)
    final_response.uri.to_s
  end

  # Runs the fetch for +url+ through URLCanonicalize::HTTP and returns
  # whatever that fetch yields
  def self.fetch(url)
    connection = URLCanonicalize::HTTP.new(url)
    connection.fetch
  end
end
27
+
28
+ require 'monkey_patches/uri'
29
+ require 'monkey_patches/string'
30
+ require 'monkey_patches/addressable/uri'
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'url_canonicalize/version'
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = 'url_canonicalize'
8
+ s.version = URLCanonicalize::VERSION
9
+ s.authors = ['Dominic Sayers']
10
+ s.email = ['developers@xenapto.com']
11
+ s.summary = 'Finds the canonical version of a URL'
12
+ s.description = 'Rubygem that finds the canonical version of a URL by '\
13
+ 'providing #canonicalize methods for the String, URI::HTTP'\
14
+ ', URI::HTTPS and Addressable::URI classes'
15
+ s.homepage = 'https://github.com/Xenapto/url_canonicalize'
16
+ s.license = 'MIT'
17
+
18
+ s.files = `git ls-files`.split($RS).reject do |file|
19
+ file =~ /^spec\//
20
+ end
21
+
22
+ s.test_files = []
23
+ s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
24
+ s.require_paths = ['lib']
25
+
26
+ s.add_dependency 'addressable', '~> 2' # To normalize URLs
27
+ s.add_dependency 'nokogiri', '~> 1' # To look for <link rel="canonical" ...> in HTML
28
+ end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_canonicalize
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Dominic Sayers
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-10-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1'
41
+ description: 'Rubygem that finds the canonical version of a URL by providing #canonicalize
42
+ methods for the String, URI::HTTP, URI::HTTPS and Addressable::URI classes'
43
+ email:
44
+ - developers@xenapto.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - ".hound.yml"
51
+ - ".rspec"
52
+ - ".rubocop.yml"
53
+ - ".ruby-gemset"
54
+ - ".ruby-version"
55
+ - Gemfile
56
+ - Gemfile.local.example
57
+ - Gemfile.lock
58
+ - Guardfile
59
+ - LICENSE
60
+ - README.md
61
+ - Rakefile
62
+ - circle.yml
63
+ - lib/monkey_patches/addressable/uri.rb
64
+ - lib/monkey_patches/string.rb
65
+ - lib/monkey_patches/uri.rb
66
+ - lib/url_canonicalize.rb
67
+ - lib/url_canonicalize/exception.rb
68
+ - lib/url_canonicalize/http.rb
69
+ - lib/url_canonicalize/request.rb
70
+ - lib/url_canonicalize/response.rb
71
+ - lib/url_canonicalize/uri.rb
72
+ - lib/url_canonicalize/version.rb
73
+ - url_canonicalize.gemspec
74
+ homepage: https://github.com/Xenapto/url_canonicalize
75
+ licenses:
76
+ - MIT
77
+ metadata: {}
78
+ post_install_message:
79
+ rdoc_options: []
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ requirements: []
93
+ rubyforge_project:
94
+ rubygems_version: 2.6.7
95
+ signing_key:
96
+ specification_version: 4
97
+ summary: Finds the canonical version of a URL
98
+ test_files: []