url_normalizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f3d1f7c14baac847e69e2183e034ba1f264eb427
4
+ data.tar.gz: 3d9b4927ab8615bdd3f5b627eb506a7f318ffdf6
5
+ SHA512:
6
+ metadata.gz: c3869cc402f2d7cb00328fdcfe232fd91f1c99844230f3ce119e7052d41b75c6855a620cd858373e857d73d61debe83648eb1a81bd7d0d77f6803c096f210aa3
7
+ data.tar.gz: 87c6f90f9f09c484c7db6a0b78a9f12a97f924d65b3839cd4eaa1310c41a3a036f71c0c64e7c00f34c57828d748acddecab1c457503c069cee0787ad2af132ad
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Factlink, Inc
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # UrlNormalizer
2
+
3
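+ A Ruby gem that normalizes URLs: it strips fragments and tracking parameters (such as `utm_source`) and applies site-specific rules for a few known hosts.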
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'url_normalizer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install url_normalizer
18
+
19
+ ## Usage
20
+
21
+ Call `UrlNormalizer.normalize(url)` with a URL string; it returns the normalized URL as a string.
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,83 @@
1
+ require "url_normalizer/version"
2
+ require 'addressable/uri'
3
+ require 'cgi'
4
+
5
+ class UrlNormalizer
6
+ @@normalizer_for = Hash.new(UrlNormalizer)
7
+
8
+ def self.normalize_for domain
9
+ @@normalizer_for[domain] = self
10
+ end
11
+
12
+ def self.normalize url
13
+ url.sub!(/#(?!\!)[^#]*$/,'')
14
+
15
+ uri = Addressable::URI.parse(url)
16
+
17
+ @@normalizer_for[uri.host].new(uri).normalize
18
+ end
19
+
20
+ def initialize uri
21
+ @uri = uri
22
+ end
23
+
24
+ def normalize
25
+ uri = @uri
26
+
27
+ uri.query = clean_query(uri.query)
28
+ uri.normalize!
29
+
30
+ url = uri.to_s
31
+ url.sub(/\?$/,'')
32
+ end
33
+
34
+ def forbidden_uri_params
35
+ [:utm_source, :utm_content, :utm_medium, :utm_campaign]
36
+ end
37
+
38
+ def whitelisted_uri_params
39
+ nil
40
+ end
41
+
42
+ def clean_query query
43
+ return unless query
44
+
45
+ uri_params = CGI.parse(query)
46
+
47
+ forbidden_params = forbidden_uri_params.map(&:to_s)
48
+ if forbidden_params
49
+ uri_params.reject! {|k,v| forbidden_params.include? k}
50
+ end
51
+
52
+ allowed_params = whitelisted_uri_params.andand.map(&:to_s)
53
+ if allowed_params
54
+ uri_params.select! {|k,v| allowed_params.include? k}
55
+ end
56
+
57
+ build_query(uri_params)
58
+ end
59
+
60
+ def encode_component component
61
+ Addressable::URI.encode_component component
62
+ end
63
+
64
+ def build_query(params)
65
+ params.map do |name,values|
66
+ escaped_name = encode_component name
67
+ if values.length > 0
68
+ values.map do |value|
69
+ escaped_value = encode_component value
70
+ "#{escaped_name}=#{escaped_value}"
71
+ end
72
+ else
73
+ ["#{escaped_name}"]
74
+ end
75
+ end.flatten.join("&")
76
+ end
77
+ end
78
+
79
+ require_relative 'url_normalizer/proxy'
80
+ require_relative 'url_normalizer/new_york_times'
81
+ require_relative 'url_normalizer/think_progress'
82
+ require_relative 'url_normalizer/linkedin'
83
+ require_relative 'url_normalizer/newyorker'
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for LinkedIn hosts: drops the "ref" parameter in addition to
  # the default forbidden parameters.
  class LinkedIn < UrlNormalizer
    %w[linkedin.com www.linkedin.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :ref]
    end
  end
end
@@ -0,0 +1,15 @@
1
class UrlNormalizer
  # Normalizer for New York Times hosts: "_r" is forbidden and only the
  # "pagewanted" parameter is kept.
  class NewYorkTimes < UrlNormalizer
    %w[nytimes.com www.nytimes.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :_r]
    end

    def whitelisted_uri_params
      [:pagewanted]
    end
  end
end
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for New Yorker hosts: drops the "mobify" parameter in addition
  # to the default forbidden parameters.
  class Newyorker < UrlNormalizer
    %w[newyorker.com www.newyorker.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :mobify]
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'andand'
2
+
3
class UrlNormalizer
  # Normalizer for the fct.li proxy: when the query carries a "url" parameter,
  # that target URL is normalized instead of the proxy URL itself.
  class Proxy < UrlNormalizer
    normalize_for 'fct.li'

    def normalize
      query = @uri.query
      # CGI.parse yields an empty array for absent keys, so +first+ is nil
      # whenever no "url" parameter was given.
      target = query && CGI.parse(query)['url'].first
      return super unless target

      UrlNormalizer.normalize target
    end
  end
end
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for ThinkProgress hosts: drops the "mobile" parameter in
  # addition to the default forbidden parameters.
  class ThinkProgress < UrlNormalizer
    %w[thinkprogress.org www.thinkprogress.org].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :mobile]
    end
  end
end
@@ -0,0 +1,3 @@
1
class UrlNormalizer
  # Gem version; frozen so the constant cannot be modified in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,15 @@
1
+ require_relative '../../lib/url_normalizer.rb'
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # The expected result is the article URL without its "ref" parameter.
    let(:article) do
      'http://www.linkedin.com/today/post/article/20130131131416-174077701-lessons-from-my-bosses'
    end

    it do
      expect(UrlNormalizer.normalize(article + '?ref=email')).to eq article
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.expand_path('../../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    let(:base) { 'http://www.nytimes.com/2011/10/30/opinion/mona-simpsons-eulogy-for-steve-jobs.html' }

    # Maps a query string appended to +base+ onto the query string expected
    # after normalization (an empty string means "no query at all").
    {
      '?_r=1' => '',
      '?_r=1&utm_source=frank' => '',
      '?pagewanted=all&_r=0' => '?pagewanted=all',
      '?pagewanted=2&_r=1utm_source=buffer&buffer_share=b0b26' => '?pagewanted=2'
    }.each do |given_query, expected_query|
      it do
        expect(UrlNormalizer.normalize(base + given_query)).to eq(base + expected_query)
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require File.expand_path('../../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # The expected result is the article URL without its "mobile" parameter.
    let(:article) do
      'http://thinkprogress.org/politics/2012/08/17/705401/how-paul-ryans-budget-would-devastate-social-programs-for-todays-lower-income-americans/'
    end

    it do
      expect(UrlNormalizer.normalize(article + '?mobile=nc')).to eq article
    end
  end
end
@@ -0,0 +1,96 @@
1
+ require File.expand_path('../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # Shorthand for the method under test.
    def normalized(url)
      UrlNormalizer.normalize(url)
    end

    it { expect(normalized('http://www.google.com/foo')).to eq 'http://www.google.com/foo' }
    it { expect(normalized('http://www.google.com/foo#bar')).to eq 'http://www.google.com/foo' }

    # Each tracking parameter is listed once so no example is defined twice.
    %w[utm_source utm_medium utm_content utm_campaign].each do |strip_param|
      it "should strip #{strip_param} if it is the only parameter" do
        expect(normalized("http://www.google.com/foo?#{strip_param}=bar")).to eq 'http://www.google.com/foo'
      end
      it "should strip #{strip_param} if there are other parameters" do
        expect(normalized("http://www.google.com/foo?#{strip_param}=bar&x=y")).to eq 'http://www.google.com/foo?x=y'
      end
    end

    it { expect(normalized('http://www.google.com/?x=y|z')).to eq 'http://www.google.com/?x=y%7Cz' }

    it { expect(normalized('http://www.example.org/file\bier.png')).to eq 'http://www.example.org/file%5Cbier.png' }

    it do
      expect(normalized('http://www.ikea.com/nl/nl/catalog/products/30101154/?icid=nl|ic|hp_main|smarteasyliving|ikea365')).to eq(
        'http://www.ikea.com/nl/nl/catalog/products/30101154/?icid=nl%7Cic%7Chp_main%7Csmarteasyliving%7Cikea365'
      )
    end

    it do
      expect(normalized('http://www.amazon.de/s/url=search-alias%3Daps')).to eq(
        'http://www.amazon.de/s/url=search-alias=aps'
      )
    end

    it do
      expect(normalized('http://www.amazon.de/s/?url=search-alias%3Daps')).to eq(
        'http://www.amazon.de/s/?url=search-alias=aps'
      )
    end

    it do
      expect(normalized('http://www.newyorker.com/online/blogs/borowitzreport/2013/08/amazon-founder-says-he-clicked-on-washington-post-by-mistake.html?mobify=0&utm_source=buffer&utm_campaign=Buffer&utm_content=buffer1526d&utm_medium=twitter')).to eq(
        'http://www.newyorker.com/online/blogs/borowitzreport/2013/08/amazon-founder-says-he-clicked-on-washington-post-by-mistake.html'
      )
    end

    describe "improvements" do
      # [ and ] are not allowed according to RFC 2732 http://www.ietf.org/rfc/rfc2732.txt
      it { expect(normalized('http://www.google.com/a[b]')).to eq 'http://www.google.com/a%5Bb%5D' }
      it { expect(normalized('http://www.google.com/foo?bar=bax|zuup')).to eq 'http://www.google.com/foo?bar=bax%7Czuup' }

      describe "normalizing proxy urls" do
        it { expect(normalized("http://fct.li/parse?url=http%3A%2F%2Fwww.google.com&factlinkModus=default")).to eq "http://www.google.com/" }
      end
      describe "it should work on normal proxy urls" do
        it { expect(normalized("http://fct.li/")).to eq "http://fct.li/" }
      end
    end

    it { expect(normalized('http://www.example.com/ff/entry.asp?123')).to eq 'http://www.example.com/ff/entry.asp?123' }

    it { expect(normalized('http://example.com/foo?x=^y')).to eq 'http://example.com/foo?x=%5Ey' }
    it { expect(normalized('http://example.com/foo?x=%y')).to eq 'http://example.com/foo?x=%25y' }
    it { expect(normalized('http://example.com/foo?x={y}')).to eq 'http://example.com/foo?x=%7By%7D' }
    it { expect(normalized('http://example.org/search.php?a=%F6')).to eq 'http://example.org/search.php?a=%F6' }

    # Percent-decodes +string+ once. URI.unescape was removed in Ruby 3.0;
    # URI.decode_www_form_component gives the same result for inputs without
    # "+" characters, which is the case for the fixture below.
    def percent_decoded(string)
      URI.decode_www_form_component(string)
    end

    it "should not bug with invalid encodings" do
      expect(normalized(percent_decoded("http%3A%2F%2Fwww.mercuryserver.com%2Fforums%2Fshowthread.php%3F100800-Stephan-Bodzin-amp-Marc-Romboy-%2596-Live-Luna-Live-Tour-(Harry-Klein-M%25FCnchen)-%2596-23-04-2"))).to eq(
        "http://www.mercuryserver.com/forums/showthread.php?100800-Stephan-Bodzin-amp-Marc-Romboy-%96-Live-Luna-Live-Tour-(Harry-Klein-M%FCnchen)-%96-23-04-2"
      )
    end

    describe 'xss protection' do
      it "url encodes < and >" do
        url = 'http://hoi/<>'
        expect(normalized(url)).to eq 'http://hoi/%3C%3E'
      end

      it "url encodes \"" do
        url = 'http://hoi/"'
        expect(normalized(url)).to eq 'http://hoi/%22'
      end

      it "keeps percent-encoded whitespace escaped" do
        url = "http://hoi/%20a%09b%0Ac%0Dd"
        expect(normalized(url)).to eq "http://hoi/%20a%09b%0Ac%0Dd"
      end

      it "leaves ' in the url (valid)" do
        url = "http://hoi/'"
        expect(normalized(url)).to eq "http://hoi/'"
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'url_normalizer/version'
5
+
6
# Gem specification for url_normalizer. The packaged file list is taken from
# `git ls-files`, so the gem must be built from inside the git checkout.
Gem::Specification.new do |spec|
  spec.name          = "url_normalizer"
  spec.version       = UrlNormalizer::VERSION
  spec.authors       = ["Mark IJbema", "Martijn Russchen", "Tom de Vries"]
  spec.email         = ["markijbema+url_normalizer@gmail.com"]
  spec.description   = "Ruby gem to normalize urls"
  spec.summary       = "Ruby gem to normalize urls"
  spec.homepage      = ""
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  # Executables are the base names of everything under bin/.
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  spec.add_dependency "andand"
  spec.add_dependency "addressable"

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Mark IJbema
8
+ - Martijn Russchen
9
+ - Tom de Vries
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2013-09-29 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: andand
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: addressable
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: bundler
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ~>
48
+ - !ruby/object:Gem::Version
49
+ version: '1.3'
50
+ type: :development
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ~>
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ - !ruby/object:Gem::Dependency
58
+ name: rake
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: rspec
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ description: Ruby gem to normalize urls
86
+ email:
87
+ - markijbema+url_normalizer@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - .gitignore
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - lib/url_normalizer.rb
98
+ - lib/url_normalizer/linkedin.rb
99
+ - lib/url_normalizer/new_york_times.rb
100
+ - lib/url_normalizer/newyorker.rb
101
+ - lib/url_normalizer/proxy.rb
102
+ - lib/url_normalizer/think_progress.rb
103
+ - lib/url_normalizer/version.rb
104
+ - spec/url_normalizer/linkedin_spec.rb
105
+ - spec/url_normalizer/new_york_times_spec.rb
106
+ - spec/url_normalizer/thinkprogress_spec.rb
107
+ - spec/url_normalizer_spec.rb
108
+ - url_normalizer.gemspec
109
+ homepage: ''
110
+ licenses:
111
+ - MIT
112
+ metadata: {}
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ requirements: []
128
+ rubyforge_project:
129
+ rubygems_version: 2.1.3
130
+ signing_key:
131
+ specification_version: 4
132
+ summary: Ruby gem to normalize urls
133
+ test_files:
134
+ - spec/url_normalizer/linkedin_spec.rb
135
+ - spec/url_normalizer/new_york_times_spec.rb
136
+ - spec/url_normalizer/thinkprogress_spec.rb
137
+ - spec/url_normalizer_spec.rb
138
+ has_rdoc: