optic14n 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
source 'https://rubygems.org'

# Runtime and development dependencies are declared in optic14n.gemspec.
gemspec

group :test do
  # The specs are written with the `should` syntax and `its`, so stay on
  # RSpec 2.x: RSpec 3 moved `its` out into the separate rspec-its gem.
  gem 'rspec', '~> 2.14'
end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Government Digital Service
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Optic14n
2
+
3
+ Canonicalises URLs.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'optic14n'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install optic14n
18
+
19
+ ## Usage
20
+
21
+ Parse a `BLURI` like this:
22
+
23
+ ```ruby
24
+ bluri = BLURI('http://somewhere.com/?a=1&b=2&c=3')
25
+ ```
26
+
27
+ Canonicalize it according to the [Previously-Established Rules](#the-previously-established-rules) thusly:
28
+
29
+ ```ruby
30
+ bluri.canonicalize!
31
+ ```
32
+
33
+ You can also do site-specific canonicalisation if you know that some of the query string is valuable, keeping either all of it or only named parameters:
34
+ ```ruby
35
+ bluri.canonicalize!(allow_query: :all)
36
+ ```
37
+
38
+ ```ruby
39
+ bluri.canonicalize!(allow_query: [:a, :c])
40
+ # or
41
+ bluri.canonicalize!(allow_query: ['a', 'c'])
42
+ ```
43
+
44
+ ### The previously-established rules
45
+
46
+ This is a gem for canonicalising HTTP URIs such that we can boil our input set of URIs down to something that is much
47
+ smaller than it would otherwise be. We do this aggressively by:
48
+
49
+ * lowercasing URIs
50
+ * removing query strings (unless told otherwise)
51
+ * removing fragments
52
+ * escaping and unescaping various characters and escape sequences according to RFC3986
53
+
54
+ ## Contributing
55
+
56
+ 1. Fork it
57
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
58
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
59
+ 4. Push to the branch (`git push origin my-new-feature`)
60
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
# Put the gem's own lib/ directory on the load path. The Rakefile sits at the
# gem root, so lib/ is resolved relative to this file (the same way
# optic14n.gemspec does it) rather than relative to the parent directory.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'optic14n'

# Pull in the rake tasks shipped with the gem.
Dir.glob('lib/tasks/*.rake').each { |r| import r }

require 'gem_publisher'
desc 'Publish gem to Rubygems'
task :publish_gem do
  # publish_if_updated's result is only reported when it is truthy.
  published = GemPublisher.publish_if_updated('optic14n.gemspec', :rubygems)
  puts "Published #{published}" if published
end

RSpec::Core::RakeTask.new(:spec)

task default: :spec
task test: :spec
data/jenkins.sh ADDED
@@ -0,0 +1,10 @@
1
+ #!/bin/bash -x
2
+ export RAILS_ENV=test
3
+ export DISPLAY=":99"
4
+
5
+ set -e
6
+ rm -f Gemfile.lock
7
+ bundle install --path "${HOME}/bundles/${JOB_NAME}"
8
+ export GOVUK_APP_DOMAIN=dev.gov.uk
9
+ bundle exec rake
10
+ bundle exec rake publish_gem
@@ -0,0 +1,45 @@
1
require 'forwardable'
require 'set'

module Optic14n
  ##
  # Canonicalizes a collection of URLs, de-duplicating the results and
  # recording every URL that could not be processed.
  #
  # Results are only available once +canonicalize!+ has run; +from_urls+
  # builds an instance and runs it in one step.
  class CanonicalizedUrls
    extend Forwardable

    # output_set:: Set of the canonicalized BLURIs
    # seen::       number of input URLs processed (including failures)
    # failures::   Hash of input URL => error raised while processing it
    attr_reader :output_set, :seen, :failures

    # +size+ and +each+ operate on the de-duplicated output set.
    def_delegators :@output_set, :size, :each

    ##
    # +urls+ is anything responding to +each+ and yielding URL strings.
    # +options+ is handed to +BLURI#canonicalize!+ for every URL.
    def initialize(urls, options)
      @urls = urls
      @options = options
    end

    ##
    # Canonicalize every URL, resetting any previous results first.
    #
    # Ordinary errors are collected in +failures+ so that one bad URL does
    # not abort the run. Only StandardError is caught: interrupts and other
    # fatal exceptions still propagate.
    def canonicalize!
      @seen = 0
      @failures = {}
      @output_set = Set.new

      @urls.each do |url|
        begin
          @output_set.add(BLURI(url).canonicalize!(@options))
        rescue StandardError => e
          @failures[url] = e
        end
        @seen += 1
      end
    end

    ##
    # Write the canonicalized URLs to +filename+, one per line.
    def write(filename)
      File.open(filename, 'w') do |file|
        @output_set.each { |url| file.puts url }
      end
    end

    ##
    # Canonicalize given urls. +options+ will be passed to
    # +BLURI#canonicalize!+ (e.g. <tt>allow_query: :all</tt>).
    # Returns the populated CanonicalizedUrls instance.
    def self.from_urls(urls, options = {})
      new(urls, options).tap(&:canonicalize!)
    end
  end
end
@@ -0,0 +1,3 @@
1
module Optic14n
  # Gem version; read by optic14n.gemspec. Frozen so it cannot be mutated
  # in place at runtime.
  VERSION = '2.0.0'.freeze
end
data/lib/optic14n.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'optic14n/version'
2
+
3
+ require 'uri'
4
+ require 'addressable/uri'
5
+ require 'cgi'
6
+ require 'forwardable'
7
+ require 'uri/query_hash'
8
+ require 'uri/bluri'
9
+
10
+ require 'optic14n/canonicalized_urls'
@@ -0,0 +1,15 @@
1
require 'set'

namespace :opt do
  # Usage: rake "opt:measure[urls.txt]"
  #        rake "opt:measure[urls.txt,canonical.txt]"
  #
  # Reads one URL per line from +filename+, canonicalizes them and prints how
  # many URLs were seen versus how many distinct ones remain. When
  # +output_file+ is given, the canonical URLs are also written to it.
  desc 'Measure reduction from canonicalisation'
  task :measure, [:filename, :output_file] do |_, args|
    filename = args[:filename]
    output_file = args[:output_file]

    # Fail with a usable message instead of a TypeError from File when the
    # task is invoked without arguments.
    raise ArgumentError, 'usage: rake opt:measure[filename,output_file]' unless filename

    # File.foreach without a block returns a line enumerator, so the input is
    # read line by line rather than loaded into memory in one go.
    urls = Optic14n::CanonicalizedUrls.from_urls(File.foreach(filename))
    urls.write(output_file) if output_file

    puts "#{urls.seen} urls seen, #{urls.size} after canonicalisation"
  end
end
data/lib/uri/bluri.rb ADDED
@@ -0,0 +1,116 @@
1
+ # encoding: utf-8
2
+
3
module URI
  ##
  # A URI class with a bit extra for canonicalising query strings.
  #
  # The parsed URI is held in @uri (an Addressable::URI) and the basic
  # component readers are delegated to it.
  class BLURI < URI::HTTP
    # Literal path characters and the (lowercase) escape each is replaced by.
    PATH_ESCAPE_MAPPINGS = {
      '[' => '%5b',
      ']' => '%5d',
      ',' => '%2c',
      '"' => '%22',
      "'" => '%27',
      '|' => '%7c',
      '!' => '%21',
      '£' => '%c2%a3'
    }.freeze

    # Escape sequences in the path that are decoded back to the literal
    # character. Applied after PATH_ESCAPE_MAPPINGS, so '!' ends up literal.
    PATH_UNESCAPE_MAPPINGS = {
      '%7e' => '~',
      '%21' => '!'
    }.freeze

    # Escaped characters that are also regex metacharacters. No longer needed
    # to build the regexes below (Regexp.union escapes for us); kept, with the
    # same value, for anything that references the constant.
    REQUIRE_REGEX_ESCAPE = (%w<. | ( ) [ ] { } + \\ ^ $ * ?> & PATH_ESCAPE_MAPPINGS.keys).freeze

    extend Forwardable

    def_delegators :@uri, :scheme, :path, :host, :host=, :query, :fragment, :to_s

    ##
    # Wrap +uri_str+. Raises URI::InvalidURIError when the parser returns
    # nothing (e.g. for nil). Prefer BLURI.parse / Kernel#BLURI, which also
    # clean the input up first.
    def initialize(uri_str)
      @uri = ::Addressable::URI.parse(uri_str)
      raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
    end

    ##
    # The query string as a QueryHash (memoized until the query is replaced).
    def query_hash
      @query_hash ||= CGI.parse(query || '').tap do |parsed|
        # By default, CGI.parse produces lots of arrays. Usually they have a
        # single element in them. That's correct but not terribly usable.
        # Fix it here.
        parsed.each_pair { |k, v| parsed[k] = v[0] if v.length == 1 }
        parsed.extend QueryHash
      end
    end

    ##
    # Replace the query from a QueryHash-like +value+; an empty rendering
    # removes the query altogether (so no trailing '?').
    def query_hash=(value)
      @query_hash = value
      rendered = @query_hash.to_s
      @uri.query = rendered == '' ? nil : rendered
    end

    ##
    # Replace the query string; '' removes the query altogether.
    def query=(query_str)
      @query_hash = nil
      @uri.query = query_str == '' ? nil : query_str
    end

    ##
    # Build a BLURI from +uri_str+. Strings are stripped, lowercased and have
    # inner spaces encoded; non-strings are passed to BLURI.new untouched.
    def self.parse(uri_str)
      # Deal with known URI spec breaks - leading/trailing spaces and unencoded entities
      if uri_str.is_a? String
        uri_str = uri_str.strip.downcase.gsub(' ', '%20')
        uri_str = uri_str.gsub('&', '%26') if uri_str.start_with?('mailto:')
      end
      new(uri_str)
    end

    ##
    # Truthy when this is an http(s) URI that has a query string.
    def has_query?
      %w(http https).include?(@uri.scheme) && query
    end

    ##
    # Canonicalize in place and return self: https becomes http, the path is
    # normalised, the query is filtered and sorted (see canonicalize_query!)
    # and the fragment is dropped.
    def canonicalize!(options = {})
      @uri.scheme = 'http' if @uri.scheme == 'https'

      # Assign through the setter rather than relying on in-place mutation of
      # whatever string the +path+ reader handed back.
      @uri.path = canonical_path(@uri.path)

      canonicalize_query!(options)

      @uri.fragment = nil
      self
    end

    ##
    # Keep only the query parameters allowed by +options[:allow_query]+
    # (:all, a single name, or a list of names given as strings or symbols;
    # anything else allows nothing) and sort what remains by key.
    def canonicalize_query!(options)
      allow_all = (options[:allow_query] == :all)

      unless allow_all
        # Compare as strings so query keys are never converted to symbols.
        allowed_keys = [options[:allow_query]].flatten
          .select { |key| key.is_a?(String) || key.is_a?(Symbol) }
          .map(&:to_s)
      end

      query_hash.keep_if { |k, _| allow_all || allowed_keys.include?(k) }

      self.query_hash = QueryHash[query_hash.sort_by { |k, _| k }]
    end

    ##
    # Generate a regex which matches all characters in PATH_ESCAPE_MAPPINGS
    def self.path_escape_char_regex
      @path_escape_char_regex ||= Regexp.union(PATH_ESCAPE_MAPPINGS.keys)
    end

    ##
    # Generate a regex which matches all escape sequences in PATH_UNESCAPE_MAPPINGS
    def self.path_unescape_code_regex
      @path_unescape_code_regex ||= Regexp.union(PATH_UNESCAPE_MAPPINGS.keys)
    end

    private

    ##
    # Return +path+ without trailing slashes, with the characters in
    # PATH_ESCAPE_MAPPINGS encoded and then the sequences in
    # PATH_UNESCAPE_MAPPINGS decoded. The argument is not modified.
    def canonical_path(path)
      path.sub(%r{/+\z}, '')
          .gsub(BLURI.path_escape_char_regex, PATH_ESCAPE_MAPPINGS)
          .gsub(BLURI.path_unescape_code_regex, PATH_UNESCAPE_MAPPINGS)
    end
  end
end

module Kernel
  ##
  # Shorthand for URI::BLURI.parse, in the style of Kernel#URI.
  def BLURI(uri_str)
    ::URI::BLURI.parse(uri_str)
  end

  module_function :BLURI
end
@@ -0,0 +1,33 @@
1
module URI
  ##
  # Extends a hash with query string rendering/semi-indifferent access:
  # a key that is missing (or blank) as given is retried as a string, so
  # hash[:a] finds 'a'.
  module QueryHash
    ##
    # Look +key+ up, falling back to +key.to_s+ when the first lookup is nil
    # or empty. Empty arrays (such as the default CGI.parse returns for
    # absent keys) are reported as nil.
    def [](key)
      item = super(key)
      # respond_to? guards values with no notion of emptiness (e.g. Integers).
      item = super(key.to_s) if item.nil? || (item.respond_to?(:empty?) && item.empty?)
      item.is_a?(Array) && item.empty? ? nil : item
    end

    ##
    # Render as a query string, in key order of the hash. Keys without a
    # value render bare ('foo&bar'); array values repeat the key.
    def to_s
      keys.map { |key| render_value(key, self[key]) }.join('&')
    end

    ##
    # Creates a new hash populated with the given objects.
    def self.[](value)
      Hash[value].tap { |hash| hash.extend(self) }
    end

    private

    ##
    # Render a single key with its value (nil, a scalar, or an array of
    # values). Keys and values are form-encoded, including value-less keys.
    def render_value(key, value)
      case value
      when nil then URI.encode_www_form_component(key)
      when Array then value.map { |el| render_value(key, el) }.join('&')
      else "#{URI.encode_www_form_component(key)}=#{URI.encode_www_form_component(value)}"
      end
    end
  end
end
data/optic14n.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'optic14n/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'optic14n'
8
+ spec.version = Optic14n::VERSION
9
+ spec.authors = ['Russell Garner']
10
+ spec.email = %w(rgarner@zephyros-systems.co.uk)
11
+ spec.description = %q{Canonicalises URLs.}
12
+ spec.summary = %q{Specifically, HTTP URLs, for a limited purpose}
13
+ spec.homepage = ''
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = %w(lib)
20
+
21
+ spec.add_dependency 'addressable', '~> 2.3'
22
+
23
+ spec.add_development_dependency 'rake'
24
+ spec.add_development_dependency 'gem_publisher', '~> 1.3.0'
25
+ end
@@ -0,0 +1,88 @@
1
+ require 'spec_helper'
2
+
3
describe URI::BLURI do
  it 'should be an HTTP URI' do
    bluri = BLURI('http://some.where.com')
    bluri.should be_a URI::HTTP
  end

  # NOTE(review): this example asserts nothing - the lambda is built but never
  # called, because the expectation sits inside the braces. Left unchanged:
  # BLURI.new performs no scheme check of its own, so confirm the intended
  # behaviour before turning this into a real expectation.
  it 'should not allow other schemes' do
    lambda { BLURI('ftp://foo').should raise_error(ArgumentError) }
  end

  it 'should not allow nil' do
    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
  end

  it 'supports scheme' do
    BLURI('http://foo').scheme.should == 'http'
  end
  it 'supports host' do
    BLURI('http://foo').host.should == 'foo'
  end
  it 'supports path' do
    BLURI('http://foo/a/path').path.should == '/a/path'
  end
  it 'supports query' do
    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
  end
  it 'supports fragment' do
    BLURI('http://foo#fragment').fragment.should == 'fragment'
  end
  it 'supports mailto:someone@somewhere' do
    BLURI('mailto:me@there.com').to_s.should == 'mailto:me@there.com'
  end
  it 'corrects unencoded ampersands ins mailto' do # http://www.faqs.org/rfcs/rfc2368.html
    BLURI('mailto:fruit&veg.newcastle@rpa.gsi.gov.uk').to_s.should == 'mailto:fruit%26veg.newcastle@rpa.gsi.gov.uk'
  end
  it 'corrects trailing spaces' do
    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
  end
  it 'corrects leading spaces' do
    BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  describe 'Query string parsing' do
    context 'the query string is of HTML-encoded form k=v&q=p' do
      before do
        @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
      end

      it 'indexes the query string' do
        @bluri.query_hash['itemid'].should == '1'
      end

      it 'allows indexing by symbol' do
        @bluri.query_hash[:itemid].should == '1'
      end

      it 'shows nil for absent items' do
        @bluri.query_hash[:eerie_flash].should == nil
      end

      # The value is lowercase because BLURI() downcases the whole input.
      it 'indexes the second query string item' do
        @bluri.query_hash['type'].should == 'resource'
      end

      it 'allows setting of the query' do
        @bluri.query = 'furry=really'
        @bluri.to_s.should == 'http://some.com/a/path?furry=really'
      end
    end

    context 'the querystring is not an HTML-encoded thing' do
      before do
        @bluri = BLURI('http://some.com/a/path?foo&bar')
      end

      it 'retains the query string' do
        @bluri.query.should == 'foo&bar'
      end

      # Check both value-less keys ('foo' used to be asserted twice).
      it 'has a query hash with empty elements' do
        @bluri.query_hash['foo'].should == nil
        @bluri.query_hash['bar'].should == nil
      end
    end
  end
end
data/spec/c14n.t ADDED
@@ -0,0 +1,71 @@
1
+ # Here for reference, see original at
2
+ # https://github.com/alphagov/redirector/blob/master/tests/lib/c14n.t
3
+
4
+ use strict;
5
+ use Test::More;
6
+ require 'lib/c14n.pl';
7
+
8
+ #
9
+ # case
10
+ #
11
+ is(c14n_url("http://www.EXAMPLE.COM/Foo/Bar/BAZ"), "http://www.example.com/foo/bar/baz", "c14n URL is lower-case");
12
+
13
+ #
14
+ # protocol
15
+ #
16
+ is(c14n_url("https://www.example.com"), "http://www.example.com", "translates protocol to http");
17
+
18
+ #
19
+ # slashes
20
+ #
21
+ is(c14n_url("http://www.example.com/"), "http://www.example.com", "drops trailing slash");
22
+ is(c14n_url("http://www.example.com////"), "http://www.example.com", "drops multiple trailing slashes");
23
+
24
+ #
25
+ # fragment identifier
26
+ #
27
+ is(c14n_url("http://www.example.com#foo"), "http://www.example.com", "drops fragment identifier");
28
+ is(c14n_url("http://www.example.com/#foo"), "http://www.example.com", "drops fragment identifier and slashes");
29
+
30
+ #
31
+ # encoding
32
+ #
33
+ is(c14n_url("http://www.example.com/:colon:"), "http://www.example.com/:colon:", "colons");
34
+ is(c14n_url("http://www.example.com/~tide"), "http://www.example.com/~tide", "tide");
35
+ is(c14n_url("http://www.example.com/_underscore_"), "http://www.example.com/_underscore_", "underscore");
36
+ is(c14n_url("http://www.example.com/*asterisk*"), "http://www.example.com/*asterisk*", "asterisk");
37
+ is(c14n_url("http://www.example.com/(parens)"), "http://www.example.com/(parens)", "parens");
38
+ is(c14n_url("http://www.example.com/[square-brackets]"), "http://www.example.com/%5bsquare-brackets%5d", "square-brackets");
39
+
40
+ is(c14n_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27', "commas and quotes");
41
+ is(c14n_url("http://www.example.com/problematic-in-curl[]||[and-regexes]"), "http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d", "square brackets and pipes");
42
+ is(c14n_url("http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21"),
43
+ 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!',
44
+ "non-reserved character percent decoding");
45
+
46
+ is(c14n_url("https://www.example.com/pound-sign-£"), "http://www.example.com/pound-sign-%c2%a3", "pound sign");
47
+
48
+ #
49
+ # query_strings
50
+ #
51
+ is(c14n_url("http://www.example.com?q=foo"), "http://www.example.com", "drops disallowed query-string");
52
+ is(c14n_url("http://www.example.com/?q=foo"), "http://www.example.com", "drops disallowed query-string after slash");
53
+ is(c14n_url("http://www.example.com/?q=foo#bar"), "http://www.example.com", "drops disallowed query-string after a slash with fragid");
54
+
55
+ is(c14n_url("http://www.example.com?a=1&c=3&b=2", '*'), "http://www.example.com?a=1&b=2&c=3", "query string wildcard value");
56
+
57
+ is(c14n_url("http://www.example.com/?q=foo", "q"), "http://www.example.com?q=foo", "allow named query_string parameter");
58
+
59
+ is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "sorts query_string values");
60
+ is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", " b e,c:d, a "), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "accept colon and space separated allowed values");
61
+ is(c14n_url("http://www.example.com?c=23;d=1;b=909;e=33;a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "converts matrix URI to query_string");
62
+
63
+ is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "topic,item"), "http://www.example.com?item=23444&topic=334499", "allows cherry-picked query_string");
64
+ is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "foo,bar,baz"), "http://www.example.com", "no ? for empty query_string values");
65
+
66
+ is(c14n_url("http://www.example.com?a=you're_dangerous", '*'), "http://www.example.com?a=you%27re_dangerous", "escape query string values");
67
+
68
+ #
69
+ # normalise url
70
+ #
71
+ is(normalise_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-CSV-harder-to-%27awk%27', "commas and quotes");
data/spec/c14n_spec.rb ADDED
@@ -0,0 +1,163 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe "Paul's tests, translated from Perl" do
6
+ it 'lowercases URLs' do
7
+ BLURI('http://www.EXAMPLE.COM/Foo/Bar/BAZ').canonicalize!.to_s.should == 'http://www.example.com/foo/bar/baz'
8
+ end
9
+
10
+ describe 'protocol' do
11
+ it 'translates protocol to http', reason: 'Reduces our input space, everything public anyway' do
12
+ BLURI('https://www.example.com').canonicalize!.to_s.should == 'http://www.example.com'
13
+ end
14
+ end
15
+
16
+ describe 'slashes' do
17
+ it 'drops single trailing slashes' do
18
+ BLURI('http://www.example.com/').canonicalize!.to_s.should == 'http://www.example.com'
19
+ end
20
+
21
+ it 'drops multiple trailing slashes' do
22
+ BLURI('http://www.example.com////').canonicalize!.to_s.should == 'http://www.example.com'
23
+ end
24
+
25
+ it 'drops multiple trailing slashes on the path' do
26
+ BLURI('http://www.example.com/foo///').canonicalize!.to_s.should == 'http://www.example.com/foo'
27
+ end
28
+ end
29
+
30
+ describe 'fragments' do
31
# Double-quoted so the apostrophe survives: 'won''t' is two adjacent string
# literals, which Ruby concatenates to "wont".
it 'drops fragment identifier', reason: "They won't be mapped, so are redundant" do
  BLURI('http://www.example.com#foo').canonicalize!.to_s.should == 'http://www.example.com'
end
34
+ it 'drops fragment identifier and slashes' do
35
+ BLURI('http://www.example.com/#foo').canonicalize!.to_s.should == 'http://www.example.com'
36
+ end
37
+ end
38
+
39
+ describe 'Things to keep verbatim or encode', reason: 'http://tools.ietf.org/html/rfc3986' do
40
+ it 'retains colons' do
41
+ BLURI('http://www.example.com/:colon:').canonicalize!.to_s.should == 'http://www.example.com/:colon:'
42
+ end
43
+ it 'retains tilde' do
44
+ BLURI('http://www.example.com/~tilde').canonicalize!.to_s.should == 'http://www.example.com/~tilde'
45
+ end
46
+ it 'retains underscores' do
47
+ BLURI('http://www.example.com/_underscore_').canonicalize!.to_s.should == 'http://www.example.com/_underscore_'
48
+ end
49
+ it 'retains asterisks' do
50
+ BLURI('http://www.example.com/*asterisk*').canonicalize!.to_s.should == 'http://www.example.com/*asterisk*'
51
+ end
52
+ it 'retains parens' do
53
+ BLURI('http://www.example.com/(parens)').canonicalize!.to_s.should == 'http://www.example.com/(parens)'
54
+ end
55
+ it 'escapes square brackets' do
56
+ BLURI('http://www.example.com/[square-brackets]').canonicalize!.to_s.should == 'http://www.example.com/%5bsquare-brackets%5d'
57
+ end
58
+ it 'encodes commas and quotes', reason: 'They make csv harder to awk' do
59
+ BLURI("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'").canonicalize!.to_s.should ==
60
+ 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27'
61
+ end
62
+ it 'encodes square brackets and pipes', reason: "It's problematic in curl and regexes" do
63
+ BLURI('http://www.example.com/problematic-in-curl[]||[and-regexes]').canonicalize!.to_s.should ==
64
+ 'http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d'
65
+ end
66
+ it 'decodes non-reserved characters (! and ~)' do
67
+ # My god, it's full of stars
68
+ BLURI('http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21').
69
+ canonicalize!.to_s.should == 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!'
70
+ end
71
+ it 'encodes pound signs' do
72
+ BLURI('https://www.example.com/pound-sign-£').canonicalize!.to_s.should == 'http://www.example.com/pound-sign-%c2%a3'
73
+ end
74
+ end
75
+
76
+ describe 'query strings' do
77
+ it 'disallows all query string params by default' do
78
+ BLURI('http://www.example.com?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
79
+ end
80
# Double-quoted so the apostrophe survives: 'there''s' is two adjacent string
# literals, which Ruby concatenates to "theres".
it "disallows all params when there's a slash" do
  BLURI('http://www.example.com/?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
end
83
+ it 'disallows all params after a slash with fragid' do
84
+ BLURI('http://www.example.com/?q=foo#bar').canonicalize!.to_s.should == 'http://www.example.com'
85
+ end
86
+
87
+ describe 'allowing some or all query string values' do
88
+ it 'allows named query_string parameters' do
89
+ BLURI('http://www.example.com/?q=foo&r=bar').canonicalize!(allow_query: 'q').to_s.should ==
90
+ 'http://www.example.com?q=foo'
91
+ end
92
+ it 'sorts query string values' do
93
+ BLURI('http://www.example.com?c=23&d=1&b=909&e=33&a=1').
94
+ canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
95
+ end
96
+ it 'encodes querystring values' do
97
+ BLURI("http://www.example.com?a=you're_dangerous").canonicalize!(allow_query: :all).to_s.should ==
98
+ 'http://www.example.com?a=you%27re_dangerous'
99
+ end
100
+ it 'whitelists and sorts query strings' do
101
+ BLURI('http://www.example.com?a=1&c=3&b=2').canonicalize!(allow_query: :all).to_s.should ==
102
+ 'http://www.example.com?a=1&b=2&c=3'
103
+ end
104
+ it 'converts matrix URI to query_string' do
105
+ BLURI('http://www.example.com?c=23;d=1;b=909;e=33;a=1').
106
+ canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
107
+ end
108
+ it 'sorts cherry-picked query string arguments' do
109
+ BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
110
+ canonicalize!(allow_query: [:topic, :item]).to_s.should == 'http://www.example.com?item=23444&topic=334499'
111
+ end
112
+ it 'ignores empty querystring values' do
113
+ BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
114
+ canonicalize!(allow_query: %w(foo bar baz)).to_s.should == 'http://www.example.com'
115
+ end
116
+
117
+ describe 'querystrings that are not an HTML-encoded thing' do
118
+ before do
119
+ @bluri = BLURI('http://some.com/a/path?foo&bar').canonicalize!(allow_query: :all)
120
+ end
121
+
122
+ it 'retains the query string' do
123
+ @bluri.query.should == 'bar&foo'
124
+ end
125
+
126
+ it 'has a query hash with empty elements' do
127
+ @bluri.query_hash['foo'].should == nil
128
+ @bluri.query_hash['bar'].should == nil
129
+ end
130
+
131
+ it 'renders the string properly' do
132
+ @bluri.query_hash.to_s.should == 'bar&foo'
133
+ end
134
+ end
135
+ end
136
+
137
+ describe 'degenerate cases' do
138
+ describe 'the treatment of query strings when there are query string octets that unescape to '\
139
+ 'invalid UTF-8 sequences (we no longer treat these as failures)' do
140
+ it 'no longer raises exceptions when there are bad things in query values' do
141
+ BLURI('http://example.com/path?view=%ED').
142
+ canonicalize!(allow_query: :all).
143
+ to_s.should eql('http://example.com/path?view=%ED')
144
+ end
145
+
146
+ it 're-encodes correctly when there are bad things in query keys' do
147
+ BLURI('http://example.com/path?%ED=view').
148
+ canonicalize!(allow_query: :all).
149
+ to_s.should eql('http://example.com/path?%ED=view')
150
+ end
151
+ end
152
+
153
+ describe 'failure to canonicalize paths correctly' do
154
+ # see https://www.pivotaltracker.com/s/projects/860575/stories/54502932
155
+
156
+ subject { BLURI('http://www.voa.gov.uk/stuff/?query=thing').canonicalize!(allow_query: :all) }
157
+
158
+ its(:path) { should eql('/stuff') }
159
+ its(:query) { should eql('query=thing') }
160
+ end
161
+ end
162
+ end
163
+ end
@@ -0,0 +1,59 @@
1
+ require 'spec_helper'
2
+
3
+ describe Optic14n::CanonicalizedUrls do
4
+ describe 'c14nize' do
5
+ let(:test_urls) do
6
+ %w(
7
+ http://www.qhm.mod.uk/portsmouth/leisure/fuel
8
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?
9
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view
10
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view&id=199
11
+ http://unistats.direct.gov.uk/searchResults.do?pname=institutesearchresults&level3Subjects=L3.90%AC10007761%ACFIRSTDEGREE%ACFulltime%AC430%ACNo%AC60%ACYes%AC83%ACNo%ACYes
12
+ 1234://123
13
+ )
14
+ end
15
+
16
+ context 'options[:allow_query] is false' do
17
+ subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: false) }
18
+
19
+ it { should be_a(Optic14n::CanonicalizedUrls) }
20
+
21
+ its(:seen) { should eql(6) }
22
+
23
+ describe 'the output set' do
24
+ subject(:output_set) { c14nizer.output_set }
25
+
26
+ its(:size) { should eql(3) }
27
+
28
+ describe 'the items' do
29
+ subject { output_set.map(&:to_s) }
30
+
31
+ it { should include('http://www.qhm.mod.uk/portsmouth/leisure/fuel') }
32
+ it { should include('http://www.qhm.mod.uk/portsmouth/leisure/lntm') }
33
+ it { should include('http://unistats.direct.gov.uk/searchresults.do') }
34
+ end
35
+ end
36
+ end
37
+
38
+ context 'options[:allow_query] is :all' do
39
+ subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: :all) }
40
+
41
+ describe 'the output set' do
42
+ subject(:output_set) { c14nizer.output_set }
43
+
44
+ its(:size) { should eql(5) }
45
+ end
46
+
47
+ describe 'failures' do
48
+ subject(:failures) { c14nizer.failures }
49
+
50
+ it { should be_a(Hash) }
51
+
52
+ it 'has our last URL and an error' do
53
+ e = failures[test_urls.last]
54
+ e.should be_an(Addressable::URI::InvalidURIError)
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe URI::QueryHash do
4
+ subject(:hash) { {}.extend URI::QueryHash }
5
+
6
+ its(:to_s) { should eql('') }
7
+
8
+ describe 'setting a value by symbol' do
9
+ before { hash['x'] = '1' }
10
+
11
+ its([:x]) { should eql('1') }
12
+ its(['x']) { should eql('1') }
13
+ its(:to_s) { should eql('x=1') }
14
+ end
15
+ end
@@ -0,0 +1 @@
1
+ require 'optic14n'
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe URI::QueryHash do
4
+ describe 'non-HTML encoded query strings' do
5
+ subject { { 'foo' => nil, 'bar' => nil }.extend URI::QueryHash }
6
+
7
+ its(['foo']) { should be_nil }
8
+ its(['bar']) { should be_nil }
9
+ its(:to_s) { should eql('foo&bar') }
10
+ end
11
+ end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: optic14n
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Russell Garner
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: addressable
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.3'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: gem_publisher
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 1.3.0
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.3.0
62
+ description: Canonicalises URLs.
63
+ email:
64
+ - rgarner@zephyros-systems.co.uk
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - LICENSE.txt
72
+ - README.md
73
+ - Rakefile
74
+ - jenkins.sh
75
+ - lib/optic14n.rb
76
+ - lib/optic14n/canonicalized_urls.rb
77
+ - lib/optic14n/version.rb
78
+ - lib/tasks/measure_reduction.rake
79
+ - lib/uri/bluri.rb
80
+ - lib/uri/query_hash.rb
81
+ - optic14n.gemspec
82
+ - spec/bluri_spec.rb
83
+ - spec/c14n.t
84
+ - spec/c14n_spec.rb
85
+ - spec/canonicalized_urls_spec.rb
86
+ - spec/query_hash_spec.rb
87
+ - spec/spec_helper.rb
88
+ - spec/uri/query_hash_spec.rb
89
+ homepage: ''
90
+ licenses:
91
+ - MIT
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ segments:
103
+ - 0
104
+ hash: 2602697415991458495
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ segments:
112
+ - 0
113
+ hash: 2602697415991458495
114
+ requirements: []
115
+ rubyforge_project:
116
+ rubygems_version: 1.8.23
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Specifically, HTTP URLs, for a limited purpose
120
+ test_files:
121
+ - spec/bluri_spec.rb
122
+ - spec/c14n.t
123
+ - spec/c14n_spec.rb
124
+ - spec/canonicalized_urls_spec.rb
125
+ - spec/query_hash_spec.rb
126
+ - spec/spec_helper.rb
127
+ - spec/uri/query_hash_spec.rb