url_normalizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: f3d1f7c14baac847e69e2183e034ba1f264eb427
4
+ data.tar.gz: 3d9b4927ab8615bdd3f5b627eb506a7f318ffdf6
5
+ SHA512:
6
+ metadata.gz: c3869cc402f2d7cb00328fdcfe232fd91f1c99844230f3ce119e7052d41b75c6855a620cd858373e857d73d61debe83648eb1a81bd7d0d77f6803c096f210aa3
7
+ data.tar.gz: 87c6f90f9f09c484c7db6a0b78a9f12a97f924d65b3839cd4eaa1310c41a3a036f71c0c64e7c00f34c57828d748acddecab1c457503c069cee0787ad2af132ad
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Factlink, Inc
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # UrlNormalizer
2
+
3
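+ UrlNormalizer normalizes URLs: it drops the fragment, strips tracking parameters such as `utm_source`, and applies site-specific rules for a few known hosts.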
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'url_normalizer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install url_normalizer
18
+
19
+ ## Usage
20
+
21
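+ Call `UrlNormalizer.normalize(url)` to get the normalized URL as a string; for example, `UrlNormalizer.normalize('http://www.google.com/foo#bar')` returns `'http://www.google.com/foo'`.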
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,83 @@
1
+ require "url_normalizer/version"
2
+ require 'addressable/uri'
3
+ require 'cgi'
4
+
5
+ class UrlNormalizer
6
+ @@normalizer_for = Hash.new(UrlNormalizer)
7
+
8
+ def self.normalize_for domain
9
+ @@normalizer_for[domain] = self
10
+ end
11
+
12
+ def self.normalize url
13
+ url.sub!(/#(?!\!)[^#]*$/,'')
14
+
15
+ uri = Addressable::URI.parse(url)
16
+
17
+ @@normalizer_for[uri.host].new(uri).normalize
18
+ end
19
+
20
+ def initialize uri
21
+ @uri = uri
22
+ end
23
+
24
+ def normalize
25
+ uri = @uri
26
+
27
+ uri.query = clean_query(uri.query)
28
+ uri.normalize!
29
+
30
+ url = uri.to_s
31
+ url.sub(/\?$/,'')
32
+ end
33
+
34
+ def forbidden_uri_params
35
+ [:utm_source, :utm_content, :utm_medium, :utm_campaign]
36
+ end
37
+
38
+ def whitelisted_uri_params
39
+ nil
40
+ end
41
+
42
+ def clean_query query
43
+ return unless query
44
+
45
+ uri_params = CGI.parse(query)
46
+
47
+ forbidden_params = forbidden_uri_params.map(&:to_s)
48
+ if forbidden_params
49
+ uri_params.reject! {|k,v| forbidden_params.include? k}
50
+ end
51
+
52
+ allowed_params = whitelisted_uri_params.andand.map(&:to_s)
53
+ if allowed_params
54
+ uri_params.select! {|k,v| allowed_params.include? k}
55
+ end
56
+
57
+ build_query(uri_params)
58
+ end
59
+
60
+ def encode_component component
61
+ Addressable::URI.encode_component component
62
+ end
63
+
64
+ def build_query(params)
65
+ params.map do |name,values|
66
+ escaped_name = encode_component name
67
+ if values.length > 0
68
+ values.map do |value|
69
+ escaped_value = encode_component value
70
+ "#{escaped_name}=#{escaped_value}"
71
+ end
72
+ else
73
+ ["#{escaped_name}"]
74
+ end
75
+ end.flatten.join("&")
76
+ end
77
+ end
78
+
79
+ require_relative 'url_normalizer/proxy'
80
+ require_relative 'url_normalizer/new_york_times'
81
+ require_relative 'url_normalizer/think_progress'
82
+ require_relative 'url_normalizer/linkedin'
83
+ require_relative 'url_normalizer/newyorker'
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for linkedin.com: additionally strips the +ref+ parameter.
  class LinkedIn < UrlNormalizer
    %w[linkedin.com www.linkedin.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :ref]
    end
  end
end
@@ -0,0 +1,15 @@
1
class UrlNormalizer
  # Normalizer for nytimes.com: strips +_r+ and keeps only +pagewanted+.
  class NewYorkTimes < UrlNormalizer
    %w[nytimes.com www.nytimes.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :_r]
    end

    # Only the page selector is kept; every other parameter is dropped.
    def whitelisted_uri_params
      [:pagewanted]
    end
  end
end
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for newyorker.com: additionally strips the +mobify+ parameter.
  class Newyorker < UrlNormalizer
    %w[newyorker.com www.newyorker.com].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :mobify]
    end
  end
end
@@ -0,0 +1,16 @@
1
+ require 'andand'
2
+
3
class UrlNormalizer
  # Normalizer for fct.li proxy links. When the query carries a +url+
  # parameter, that wrapped URL is normalized instead of the proxy URL itself.
  class Proxy < UrlNormalizer
    normalize_for 'fct.li'

    def normalize
      # CGI.parse returns an empty array for absent keys, so +first+ is nil
      # when no url parameter is present.
      wrapped_url = @uri.query && CGI.parse(@uri.query)['url'].first
      return super unless wrapped_url

      UrlNormalizer.normalize wrapped_url
    end
  end
end
@@ -0,0 +1,10 @@
1
class UrlNormalizer
  # Normalizer for thinkprogress.org: additionally strips the +mobile+ parameter.
  class ThinkProgress < UrlNormalizer
    %w[thinkprogress.org www.thinkprogress.org].each { |host| normalize_for host }

    def forbidden_uri_params
      [*super, :mobile]
    end
  end
end
@@ -0,0 +1,3 @@
1
class UrlNormalizer
  # Gem version; frozen so the constant cannot be modified in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,15 @@
1
+ require_relative '../../lib/url_normalizer.rb'
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # The LinkedIn normalizer removes the +ref+ tracking parameter.
    it do
      article = 'http://www.linkedin.com/today/post/article/20130131131416-174077701-lessons-from-my-bosses'
      UrlNormalizer.normalize(article + '?ref=email').should == article
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.expand_path('../../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    let(:base) { 'http://www.nytimes.com/2011/10/30/opinion/mona-simpsons-eulogy-for-steve-jobs.html' }

    # Normalizes the base article URL with +query+ appended.
    def normalized_with(query)
      UrlNormalizer.normalize(base + query)
    end

    it { normalized_with('?_r=1').should == base }
    it { normalized_with('?_r=1&utm_source=frank').should == base }

    it do
      normalized_with('?pagewanted=all&_r=0').should == base + '?pagewanted=all'
    end

    # NOTE(review): there is no '&' between '_r=1' and 'utm_source' in this
    # input; presumably a deliberately malformed real-world URL -- confirm.
    it do
      normalized_with('?pagewanted=2&_r=1utm_source=buffer&buffer_share=b0b26').should == base + '?pagewanted=2'
    end
  end
end
@@ -0,0 +1,15 @@
1
+ require File.expand_path('../../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # The ThinkProgress normalizer removes the +mobile+ parameter.
    it do
      article = 'http://thinkprogress.org/politics/2012/08/17/705401/how-paul-ryans-budget-would-devastate-social-programs-for-todays-lower-income-americans/'
      UrlNormalizer.normalize(article + '?mobile=nc').should == article
    end
  end
end
@@ -0,0 +1,96 @@
1
+ require File.expand_path('../../lib/url_normalizer.rb', __FILE__)
2
+ require 'uri'
3
+ require 'base64'
4
+
5
describe UrlNormalizer do
  describe ".normalize" do
    # Shorthand for the method under test.
    def normalized url
      UrlNormalizer.normalize(url)
    end

    it { normalized('http://www.google.com/foo').should == 'http://www.google.com/foo' }
    it { normalized('http://www.google.com/foo#bar').should == 'http://www.google.com/foo' }

    # Each tracking parameter is listed once so no example is defined twice.
    %w[utm_source utm_medium utm_content utm_campaign].each do |strip_param|
      it "should strip #{strip_param} if it is the only parameter" do
        normalized("http://www.google.com/foo?#{strip_param}=bar").should == 'http://www.google.com/foo'
      end
      it "should strip #{strip_param} if there are other parameters" do
        normalized("http://www.google.com/foo?#{strip_param}=bar&x=y").should == 'http://www.google.com/foo?x=y'
      end
    end

    it { normalized( 'http://www.google.com/?x=y|z').should == 'http://www.google.com/?x=y%7Cz' }

    it { normalized( 'http://www.example.org/file\bier.png').should == 'http://www.example.org/file%5Cbier.png' }

    it do
      normalized( 'http://www.ikea.com/nl/nl/catalog/products/30101154/?icid=nl|ic|hp_main|smarteasyliving|ikea365').should ==
        'http://www.ikea.com/nl/nl/catalog/products/30101154/?icid=nl%7Cic%7Chp_main%7Csmarteasyliving%7Cikea365'
    end

    it do
      normalized( 'http://www.amazon.de/s/url=search-alias%3Daps').should ==
        'http://www.amazon.de/s/url=search-alias=aps'
    end

    it do
      normalized( 'http://www.amazon.de/s/?url=search-alias%3Daps').should ==
        'http://www.amazon.de/s/?url=search-alias=aps'
    end

    it do
      normalized( 'http://www.newyorker.com/online/blogs/borowitzreport/2013/08/amazon-founder-says-he-clicked-on-washington-post-by-mistake.html?mobify=0&utm_source=buffer&utm_campaign=Buffer&utm_content=buffer1526d&utm_medium=twitter').should ==
        'http://www.newyorker.com/online/blogs/borowitzreport/2013/08/amazon-founder-says-he-clicked-on-washington-post-by-mistake.html'
    end

    describe "improvements" do

      it { normalized( 'http://www.google.com/a[b]').should == 'http://www.google.com/a%5Bb%5D' } # [ and ] are not allowed according to RFC 2732 http://www.ietf.org/rfc/rfc2732.txt
      it { normalized( 'http://www.google.com/foo?bar=bax|zuup').should == 'http://www.google.com/foo?bar=bax%7Czuup' }

      describe "normalizing proxy urls" do
        it { normalized( "http://fct.li/parse?url=http%3A%2F%2Fwww.google.com&factlinkModus=default").should == "http://www.google.com/" }
      end
      describe "it should work on normal proxy urls" do
        it { normalized( "http://fct.li/").should == "http://fct.li/" }
      end
    end

    it { normalized( 'http://www.example.com/ff/entry.asp?123').should == 'http://www.example.com/ff/entry.asp?123' }

    it { normalized( 'http://example.com/foo?x=^y').should == 'http://example.com/foo?x=%5Ey' }
    it { normalized( 'http://example.com/foo?x=%y').should == 'http://example.com/foo?x=%25y' }
    it { normalized( 'http://example.com/foo?x={y}').should == 'http://example.com/foo?x=%7By%7D' }
    it { normalized( 'http://example.org/search.php?a=%F6').should == 'http://example.org/search.php?a=%F6' }

    # Percent-decodes +string+ once. URI.unescape was removed in Ruby 3.0;
    # URI.decode_www_form_component gives the same result for inputs without
    # "+", which is all this spec feeds it.
    def percent_decoded(string)
      URI.decode_www_form_component(string)
    end

    it "should not bug with invalid encodings" do
      normalized( percent_decoded("http%3A%2F%2Fwww.mercuryserver.com%2Fforums%2Fshowthread.php%3F100800-Stephan-Bodzin-amp-Marc-Romboy-%2596-Live-Luna-Live-Tour-(Harry-Klein-M%25FCnchen)-%2596-23-04-2")).should ==
        "http://www.mercuryserver.com/forums/showthread.php?100800-Stephan-Bodzin-amp-Marc-Romboy-%96-Live-Luna-Live-Tour-(Harry-Klein-M%FCnchen)-%96-23-04-2"
    end

    describe 'xss protection' do
      it "url encodes < and >" do
        url = 'http://hoi/<>'
        expect(normalized(url)).to eq 'http://hoi/%3C%3E'
      end

      it "url encodes \"" do
        url = 'http://hoi/"'
        expect(normalized(url)).to eq 'http://hoi/%22'
      end

      # The expectation is that already-encoded whitespace passes through
      # unchanged, so the description says exactly that.
      it "keeps percent-encoded whitespace as it is" do
        url = "http://hoi/%20a%09b%0Ac%0Dd"
        expect(normalized(url)).to eq "http://hoi/%20a%09b%0Ac%0Dd"
      end

      it "leaves ' in the url (valid)" do
        url = "http://hoi/'"
        expect(normalized(url)).to eq "http://hoi/'"
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'url_normalizer/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "url_normalizer"
8
+ spec.version = UrlNormalizer::VERSION
9
+ spec.authors = ["Mark IJbema", "Martijn Russchen", "Tom de Vries"]
10
+ spec.email = ["markijbema+url_normalizer@gmail.com"]
11
+ spec.description = %q{Ruby gem to normalize urls}
12
+ spec.summary = %q{Ruby gem to normalize urls}
13
+ spec.homepage = ""
14
+ spec.license = "MIT"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_dependency "andand"
22
+ spec.add_dependency "addressable"
23
+
24
+ spec.add_development_dependency "bundler", "~> 1.3"
25
+ spec.add_development_dependency "rake"
26
+ spec.add_development_dependency "rspec"
27
+ end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_normalizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Mark IJbema
8
+ - Martijn Russchen
9
+ - Tom de Vries
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2013-09-29 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: andand
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - '>='
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: addressable
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :runtime
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: bundler
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ~>
48
+ - !ruby/object:Gem::Version
49
+ version: '1.3'
50
+ type: :development
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ~>
55
+ - !ruby/object:Gem::Version
56
+ version: '1.3'
57
+ - !ruby/object:Gem::Dependency
58
+ name: rake
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - '>='
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ - !ruby/object:Gem::Dependency
72
+ name: rspec
73
+ requirement: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ type: :development
79
+ prerelease: false
80
+ version_requirements: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - '>='
83
+ - !ruby/object:Gem::Version
84
+ version: '0'
85
+ description: Ruby gem to normalize urls
86
+ email:
87
+ - markijbema+url_normalizer@gmail.com
88
+ executables: []
89
+ extensions: []
90
+ extra_rdoc_files: []
91
+ files:
92
+ - .gitignore
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - lib/url_normalizer.rb
98
+ - lib/url_normalizer/linkedin.rb
99
+ - lib/url_normalizer/new_york_times.rb
100
+ - lib/url_normalizer/newyorker.rb
101
+ - lib/url_normalizer/proxy.rb
102
+ - lib/url_normalizer/think_progress.rb
103
+ - lib/url_normalizer/version.rb
104
+ - spec/url_normalizer/linkedin_spec.rb
105
+ - spec/url_normalizer/new_york_times_spec.rb
106
+ - spec/url_normalizer/thinkprogress_spec.rb
107
+ - spec/url_normalizer_spec.rb
108
+ - url_normalizer.gemspec
109
+ homepage: ''
110
+ licenses:
111
+ - MIT
112
+ metadata: {}
113
+ post_install_message:
114
+ rdoc_options: []
115
+ require_paths:
116
+ - lib
117
+ required_ruby_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - '>='
120
+ - !ruby/object:Gem::Version
121
+ version: '0'
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ requirements:
124
+ - - '>='
125
+ - !ruby/object:Gem::Version
126
+ version: '0'
127
+ requirements: []
128
+ rubyforge_project:
129
+ rubygems_version: 2.1.3
130
+ signing_key:
131
+ specification_version: 4
132
+ summary: Ruby gem to normalize urls
133
+ test_files:
134
+ - spec/url_normalizer/linkedin_spec.rb
135
+ - spec/url_normalizer/new_york_times_spec.rb
136
+ - spec/url_normalizer/thinkprogress_spec.rb
137
+ - spec/url_normalizer_spec.rb
138
+ has_rdoc: