optic14n 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in optic14n.gemspec
4
+ gemspec
5
+
6
+ group :test do
7
+ gem 'rspec'
8
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Government Digital Service
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,60 @@
1
+ # Optic14n
2
+
3
+ Canonicalises URLs.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'optic14n'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install optic14n
18
+
19
+ ## Usage
20
+
21
+ Parse a `BLURI` like this:
22
+
23
+ ```ruby
24
+ bluri = BLURI('http://somewhere.com/?a=1&b=2&c=3')
25
+ ```
26
+
27
+ Canonicalize it according to the [Previously-Established Rules](#the-previously-established-rules) thusly:
28
+
29
+ ```ruby
30
+ bluri.canonicalize!
31
+ ```
32
+
33
+ You can also do site-specific stuff if you know some of the query string will be valuable:
34
+ ```ruby
35
+ bluri.canonicalize!(allow_query: :all)
36
+ ```
37
+
38
+ ```ruby
39
+ bluri.canonicalize!(allow_query: [:a, :c])
40
+ # or
41
+ bluri.canonicalize!(allow_query: ['a', 'c'])
42
+ ```
43
+
44
+ ### The previously-established rules
45
+
46
+ This is a gem for canonicalising HTTP URIs such that we can boil our input set of URIs down to something that is much
47
+ smaller than it would otherwise be. We do this aggressively by:
48
+
49
+ * lowercasing URIs
50
+ * removing query strings (unless told otherwise)
51
+ * removing fragments
52
+ * escaping and unescaping various characters and escape sequences according to RFC3986
53
+
54
+ ## Contributing
55
+
56
+ 1. Fork it
57
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
58
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
59
+ 4. Push to the branch (`git push origin my-new-feature`)
60
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require 'optic14n'
6
+ Dir.glob('lib/tasks/*.rake').each { |r| import r }
7
+
8
+
9
+ require 'gem_publisher'
10
+ desc 'Publish gem to Rubygems'
11
+ task :publish_gem do
12
+ gem = GemPublisher.publish_if_updated('optic14n.gemspec', :rubygems)
13
+ puts "Published #{gem}" if gem
14
+ end
15
+
16
+ RSpec::Core::RakeTask.new(:spec)
17
+
18
+ task default: :spec
19
+ task test: :spec
data/jenkins.sh ADDED
@@ -0,0 +1,10 @@
1
+ #!/bin/bash -x
2
+ export RAILS_ENV=test
3
+ export DISPLAY=":99"
4
+
5
+ set -e
6
+ rm -f Gemfile.lock
7
+ bundle install --path "${HOME}/bundles/${JOB_NAME}"
8
+ export GOVUK_APP_DOMAIN=dev.gov.uk
9
+ bundle exec rake
10
+ bundle exec rake publish_gem
@@ -0,0 +1,45 @@
1
+ module Optic14n
2
+ ##
3
+ # Canonicalizes a set of URLs
4
+ class CanonicalizedUrls
5
+ attr_reader :output_set, :seen, :failures, :each
6
+
7
+ extend Forwardable
8
+
9
+ def_delegators :@output_set, :size
10
+
11
+ def initialize(urls, options)
12
+ @urls = urls
13
+ @options = options
14
+ end
15
+
16
+ def canonicalize!
17
+ @seen = 0
18
+ @failures = {}
19
+ @output_set = Set.new
20
+
21
+ @urls.each do |url|
22
+ begin
23
+ @output_set.add(BLURI(url).canonicalize!(@options))
24
+ rescue Exception => e
25
+ failures[url] = e
26
+ end
27
+ @seen += 1
28
+ end
29
+ end
30
+
31
+ def write(filename)
32
+ File.open(filename, 'w') do |file|
33
+ @output_set.each do |url|
34
+ file.puts url
35
+ end
36
+ end
37
+ end
38
+
39
+ ##
40
+ # Canonicalize given urls. +options+ will be passed to +BLURI#canonicalize!+
41
+ def self.from_urls(urls, options = {})
42
+ CanonicalizedUrls.new(urls, options).tap { |c| c.canonicalize! }
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,3 @@
1
+ module Optic14n
2
+ VERSION = '2.0.0'
3
+ end
data/lib/optic14n.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'optic14n/version'
2
+
3
+ require 'uri'
4
+ require 'addressable/uri'
5
+ require 'cgi'
6
+ require 'forwardable'
7
+ require 'uri/query_hash'
8
+ require 'uri/bluri'
9
+
10
+ require 'optic14n/canonicalized_urls'
@@ -0,0 +1,15 @@
1
+ require 'set'
2
+
3
+ namespace :opt do
4
+ desc 'Measure reduction from canonicalisation'
5
+ task :measure, [:filename, :output_file] do |_, args|
6
+ filename = args[:filename]
7
+ output_file = args[:output_file]
8
+
9
+ Optic14n::CanonicalizedUrls.from_urls(File.read(filename).each_line).tap do |urls|
10
+ urls.write(output_file) if output_file
11
+
12
+ puts "#{urls.seen} urls seen, #{urls.size} after canonicalisation"
13
+ end
14
+ end
15
+ end
data/lib/uri/bluri.rb ADDED
@@ -0,0 +1,116 @@
1
+ # encoding: utf-8
2
+
3
+ module URI
4
+ ##
5
+ # A URI class with a bit extra for canonicalising query strings
6
+ #
7
+ class BLURI < URI::HTTP
8
+ PATH_ESCAPE_MAPPINGS = {
9
+ '[' => '%5b',
10
+ ']' => '%5d',
11
+ ',' => '%2c',
12
+ '"' => '%22',
13
+ "'" => '%27',
14
+ '|' => '%7c',
15
+ '!' => '%21',
16
+ '£' => '%c2%a3'
17
+ }
18
+
19
+ PATH_UNESCAPE_MAPPINGS = {
20
+ '%7e' => '~',
21
+ '%21' => '!'
22
+ }
23
+
24
+ REQUIRE_REGEX_ESCAPE = %w<. | ( ) [ ] { } + \ ^ $ * ?> & PATH_ESCAPE_MAPPINGS.keys
25
+
26
+ extend Forwardable
27
+
28
+ def_delegators :@uri, :scheme, :path, :host, :host=, :query, :fragment, :to_s
29
+
30
+ def initialize(uri_str)
31
+ @uri = ::Addressable::URI.parse(uri_str)
32
+ raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
33
+ end
34
+
35
+ def query_hash
36
+ @query_hash ||= CGI::parse(self.query || '').tap do |query_hash|
37
+ # By default, CGI::parse produces lots of arrays. Usually they have a single element
38
+ # in them. That's correct but not terribly usable. Fix it here.
39
+ query_hash.each_pair { |k, v| query_hash[k] = v[0] if v.length == 1 }
40
+ query_hash.extend QueryHash
41
+ end
42
+ end
43
+
44
+ def query_hash=(value)
45
+ @query_hash = value
46
+ @uri.query = @query_hash.to_s == '' ? nil : @query_hash.to_s
47
+ end
48
+
49
+ def query=(query_str)
50
+ @query_hash = nil
51
+ @uri.query = query_str == '' ? nil : query_str
52
+ end
53
+
54
+ def self.parse(uri_str)
55
+ # Deal with known URI spec breaks - leading/trailing spaces and unencoded entities
56
+ if uri_str.is_a? String
57
+ uri_str = uri_str.strip.downcase.gsub(' ', '%20')
58
+ uri_str.gsub!('&', '%26') if uri_str =~ /^mailto:.*&.*/
59
+ end
60
+ BLURI.new(uri_str)
61
+ end
62
+
63
+ def has_query?
64
+ %w(http https).include?(@uri.scheme) && query
65
+ end
66
+
67
+ def canonicalize!(options = {})
68
+ @uri.scheme = 'http' if @uri.scheme == 'https'
69
+
70
+ @uri.path = @uri.path.sub(/\/*$/, '') if @uri.path =~ /^*\/$/
71
+ @uri.path.gsub!(BLURI.path_escape_char_regex, PATH_ESCAPE_MAPPINGS)
72
+ @uri.path.gsub!(BLURI.path_unescape_code_regex, PATH_UNESCAPE_MAPPINGS)
73
+
74
+ canonicalize_query!(options)
75
+
76
+ @uri.fragment = nil
77
+ self
78
+ end
79
+
80
+ def canonicalize_query!(options)
81
+ allow_all = (options[:allow_query] == :all)
82
+ allowed_keys = [options[:allow_query]].flatten.compact unless allow_all
83
+
84
+ query_hash.keep_if do |k, _|
85
+ allow_all || (allowed_keys.include?(k) || allowed_keys.include?(k.to_sym))
86
+ end
87
+
88
+ self.query_hash = QueryHash[query_hash.sort_by { |k, _| k }]
89
+ end
90
+
91
+ ##
92
+ # Generate a regex which matches all characters in PATH_ESCAPE_MAPPINGS
93
+ def self.path_escape_char_regex
94
+ @path_escape_char_regex ||=
95
+ Regexp.new('[' + PATH_ESCAPE_MAPPINGS.keys.map do |char|
96
+ REQUIRE_REGEX_ESCAPE.include?(char) ? "\\#{char}" : char
97
+ end.join + ']')
98
+ end
99
+
100
+ ##
101
+ # Generate a regex which matches all escape sequences in PATH_UNESCAPE_MAPPINGS
102
+ def self.path_unescape_code_regex
103
+ @path_unescape_code_regex ||= Regexp.new(
104
+ PATH_UNESCAPE_MAPPINGS.keys.map { |code| "(?:#{code})" }.join('|')
105
+ )
106
+ end
107
+ end
108
+ end
109
+
110
+ module Kernel
111
+ def BLURI(uri_str)
112
+ ::URI::BLURI.parse(uri_str)
113
+ end
114
+
115
+ module_function :BLURI
116
+ end
@@ -0,0 +1,33 @@
1
+ module URI
2
+ ##
3
+ # Extends a hash with query string rendering/semi-indifferent access
4
+ module QueryHash
5
+ def [](key)
6
+ item = super key
7
+ item = super(key.to_s) if item.nil? || item.length == 0
8
+ item.class == Array && item.length == 0 ? nil : item
9
+ end
10
+
11
+ def to_s
12
+ keys.map { |key| render_value(key, self[key]) }.join('&')
13
+ end
14
+
15
+ ##
16
+ # Creates a new hash populated with the given objects.
17
+ def self.[](value)
18
+ Hash[value].tap do |hash|
19
+ hash.extend(QueryHash)
20
+ end
21
+ end
22
+
23
+ private
24
+
25
+ def render_value(key, value)
26
+ case value
27
+ when nil then key
28
+ when Array then value.map { |el| render_value(key, el) }.join('&')
29
+ else URI.encode_www_form_component(key) << '=' << URI.encode_www_form_component(value)
30
+ end
31
+ end
32
+ end
33
+ end
data/optic14n.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'optic14n/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'optic14n'
8
+ spec.version = Optic14n::VERSION
9
+ spec.authors = ['Russell Garner']
10
+ spec.email = %w(rgarner@zephyros-systems.co.uk)
11
+ spec.description = %q{Canonicalises URLs.}
12
+ spec.summary = %q{Specifically, HTTP URLs, for a limited purpose}
13
+ spec.homepage = ''
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = %w(lib)
20
+
21
+ spec.add_dependency 'addressable', '~> 2.3'
22
+
23
+ spec.add_development_dependency 'rake'
24
+ spec.add_development_dependency 'gem_publisher', '~> 1.3.0'
25
+ end
@@ -0,0 +1,88 @@
1
+ require 'spec_helper'
2
+
3
+ describe URI::BLURI do
4
+ it 'should be an HTTP URI' do
5
+ bluri = BLURI('http://some.where.com')
6
+ bluri.should be_a URI::HTTP
7
+ end
8
+
9
+ it 'should not allow other schemes' do
10
+ lambda { BLURI('ftp://foo').should raise_error(ArgumentError) }
11
+ end
12
+
13
+ it 'should not allow nil' do
14
+ lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
15
+ end
16
+
17
+ it 'supports scheme' do
18
+ BLURI('http://foo').scheme.should == 'http'
19
+ end
20
+ it 'supports host' do
21
+ BLURI('http://foo').host.should == 'foo'
22
+ end
23
+ it 'supports path' do
24
+ BLURI('http://foo/a/path').path.should == '/a/path'
25
+ end
26
+ it 'supports query' do
27
+ BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
28
+ end
29
+ it 'supports fragment' do
30
+ BLURI('http://foo#fragment').fragment.should == 'fragment'
31
+ end
32
+ it 'supports mailto:someone@somewhere' do
33
+ BLURI('mailto:me@there.com').to_s.should == 'mailto:me@there.com'
34
+ end
35
+ it 'corrects unencoded ampersands ins mailto' do # http://www.faqs.org/rfcs/rfc2368.html
36
+ BLURI('mailto:fruit&veg.newcastle@rpa.gsi.gov.uk').to_s.should == 'mailto:fruit%26veg.newcastle@rpa.gsi.gov.uk'
37
+ end
38
+ it 'corrects trailing spaces' do
39
+ BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
40
+ end
41
+ it 'corrects leading spaces' do
42
+ BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
43
+ end
44
+
45
+ describe 'Query string parsing' do
46
+ context 'the query string is of HTML-encoded form k=v&q=p' do
47
+ before do
48
+ @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
49
+ end
50
+
51
+ it 'indexes the query string' do
52
+ @bluri.query_hash['itemid'].should == '1'
53
+ end
54
+
55
+ it 'allows indexing by symbol' do
56
+ @bluri.query_hash[:itemid].should == '1'
57
+ end
58
+
59
+ it 'shows nil for absent items' do
60
+ @bluri.query_hash[:eerie_flash].should == nil
61
+ end
62
+
63
+ it 'indexes the second query string item' do
64
+ @bluri.query_hash['type'].should == 'resource'
65
+ end
66
+
67
+ it 'allows setting of the query' do
68
+ @bluri.query = 'furry=really'
69
+ @bluri.to_s.should == 'http://some.com/a/path?furry=really'
70
+ end
71
+ end
72
+
73
+ context 'the querystring is not an HTML-encoded thing' do
74
+ before do
75
+ @bluri = BLURI('http://some.com/a/path?foo&bar')
76
+ end
77
+
78
+ it 'retains the query string' do
79
+ @bluri.query.should == 'foo&bar'
80
+ end
81
+
82
+ it 'has a query hash with empty elements' do
83
+ @bluri.query_hash['foo'].should == nil
84
+ @bluri.query_hash['foo'].should == nil
85
+ end
86
+ end
87
+ end
88
+ end
data/spec/c14n.t ADDED
@@ -0,0 +1,71 @@
1
+ # Here for reference, see original at
2
+ # https://github.com/alphagov/redirector/blob/master/tests/lib/c14n.t
3
+
4
+ use strict;
5
+ use Test::More;
6
+ require 'lib/c14n.pl';
7
+
8
+ #
9
+ # case
10
+ #
11
+ is(c14n_url("http://www.EXAMPLE.COM/Foo/Bar/BAZ"), "http://www.example.com/foo/bar/baz", "c14n URL is lower-case");
12
+
13
+ #
14
+ # protocol
15
+ #
16
+ is(c14n_url("https://www.example.com"), "http://www.example.com", "translates protocol to http");
17
+
18
+ #
19
+ # slashes
20
+ #
21
+ is(c14n_url("http://www.example.com/"), "http://www.example.com", "drops trailing slash");
22
+ is(c14n_url("http://www.example.com////"), "http://www.example.com", "drops multiple trailing slashes");
23
+
24
+ #
25
+ # fragment identifier
26
+ #
27
+ is(c14n_url("http://www.example.com#foo"), "http://www.example.com", "drops fragment identifier");
28
+ is(c14n_url("http://www.example.com/#foo"), "http://www.example.com", "drops fragment identifier and slashes");
29
+
30
+ #
31
+ # encoding
32
+ #
33
+ is(c14n_url("http://www.example.com/:colon:"), "http://www.example.com/:colon:", "colons");
34
+ is(c14n_url("http://www.example.com/~tide"), "http://www.example.com/~tide", "tide");
35
+ is(c14n_url("http://www.example.com/_underscore_"), "http://www.example.com/_underscore_", "underscore");
36
+ is(c14n_url("http://www.example.com/*asterisk*"), "http://www.example.com/*asterisk*", "asterisk");
37
+ is(c14n_url("http://www.example.com/(parens)"), "http://www.example.com/(parens)", "parens");
38
+ is(c14n_url("http://www.example.com/[square-brackets]"), "http://www.example.com/%5bsquare-brackets%5d", "square-brackets");
39
+
40
+ is(c14n_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27', "commas and quotes");
41
+ is(c14n_url("http://www.example.com/problematic-in-curl[]||[and-regexes]"), "http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d", "square brackets and pipes");
42
+ is(c14n_url("http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21"),
43
+ 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!',
44
+ "non-reserved character percent decoding");
45
+
46
+ is(c14n_url("https://www.example.com/pound-sign-£"), "http://www.example.com/pound-sign-%c2%a3", "pound sign");
47
+
48
+ #
49
+ # query_strings
50
+ #
51
+ is(c14n_url("http://www.example.com?q=foo"), "http://www.example.com", "drops disallowed query-string");
52
+ is(c14n_url("http://www.example.com/?q=foo"), "http://www.example.com", "drops disallowed query-string after slash");
53
+ is(c14n_url("http://www.example.com/?q=foo#bar"), "http://www.example.com", "drops disallowed query-string after a slash with fragid");
54
+
55
+ is(c14n_url("http://www.example.com?a=1&c=3&b=2", '*'), "http://www.example.com?a=1&b=2&c=3", "query string wildcard value");
56
+
57
+ is(c14n_url("http://www.example.com/?q=foo", "q"), "http://www.example.com?q=foo", "allow named query_string parameter");
58
+
59
+ is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "sorts query_string values");
60
+ is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", " b e,c:d, a "), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "accept colon and space separated allowed values");
61
+ is(c14n_url("http://www.example.com?c=23;d=1;b=909;e=33;a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "converts matrix URI to query_string");
62
+
63
+ is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "topic,item"), "http://www.example.com?item=23444&topic=334499", "allows cherry-picked query_string");
64
+ is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "foo,bar,baz"), "http://www.example.com", "no ? for empty query_string values");
65
+
66
+ is(c14n_url("http://www.example.com?a=you're_dangerous", '*'), "http://www.example.com?a=you%27re_dangerous", "escape query string values");
67
+
68
+ #
69
+ # normalise url
70
+ #
71
+ is(normalise_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-CSV-harder-to-%27awk%27', "commas and quotes");
data/spec/c14n_spec.rb ADDED
@@ -0,0 +1,163 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+
5
+ describe "Paul's tests, translated from Perl" do
6
+ it 'lowercases URLs' do
7
+ BLURI('http://www.EXAMPLE.COM/Foo/Bar/BAZ').canonicalize!.to_s.should == 'http://www.example.com/foo/bar/baz'
8
+ end
9
+
10
+ describe 'protocol' do
11
+ it 'translates protocol to http', reason: 'Reduces our input space, everything public anyway' do
12
+ BLURI('https://www.example.com').canonicalize!.to_s.should == 'http://www.example.com'
13
+ end
14
+ end
15
+
16
+ describe 'slashes' do
17
+ it 'drops single trailing slashes' do
18
+ BLURI('http://www.example.com/').canonicalize!.to_s.should == 'http://www.example.com'
19
+ end
20
+
21
+ it 'drops multiple trailing slashes' do
22
+ BLURI('http://www.example.com////').canonicalize!.to_s.should == 'http://www.example.com'
23
+ end
24
+
25
+ it 'drops multiple trailing slashes on the path' do
26
+ BLURI('http://www.example.com/foo///').canonicalize!.to_s.should == 'http://www.example.com/foo'
27
+ end
28
+ end
29
+
30
+ describe 'fragments' do
31
+ it 'drops fragment identifier', reason: 'They won''t be mapped, so are redundant' do
32
+ BLURI('http://www.example.com#foo').canonicalize!.to_s.should == 'http://www.example.com'
33
+ end
34
+ it 'drops fragment identifier and slashes' do
35
+ BLURI('http://www.example.com/#foo').canonicalize!.to_s.should == 'http://www.example.com'
36
+ end
37
+ end
38
+
39
+ describe 'Things to keep verbatim or encode', reason: 'http://tools.ietf.org/html/rfc3986' do
40
+ it 'retains colons' do
41
+ BLURI('http://www.example.com/:colon:').canonicalize!.to_s.should == 'http://www.example.com/:colon:'
42
+ end
43
+ it 'retains tilde' do
44
+ BLURI('http://www.example.com/~tilde').canonicalize!.to_s.should == 'http://www.example.com/~tilde'
45
+ end
46
+ it 'retains underscores' do
47
+ BLURI('http://www.example.com/_underscore_').canonicalize!.to_s.should == 'http://www.example.com/_underscore_'
48
+ end
49
+ it 'retains asterisks' do
50
+ BLURI('http://www.example.com/*asterisk*').canonicalize!.to_s.should == 'http://www.example.com/*asterisk*'
51
+ end
52
+ it 'retains parens' do
53
+ BLURI('http://www.example.com/(parens)').canonicalize!.to_s.should == 'http://www.example.com/(parens)'
54
+ end
55
+ it 'escapes square brackets' do
56
+ BLURI('http://www.example.com/[square-brackets]').canonicalize!.to_s.should == 'http://www.example.com/%5bsquare-brackets%5d'
57
+ end
58
+ it 'encodes commas and quotes', reason: 'They make csv harder to awk' do
59
+ BLURI("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'").canonicalize!.to_s.should ==
60
+ 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27'
61
+ end
62
+ it 'encodes square brackets and pipes', reason: "It's problematic in curl and regexes" do
63
+ BLURI('http://www.example.com/problematic-in-curl[]||[and-regexes]').canonicalize!.to_s.should ==
64
+ 'http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d'
65
+ end
66
+ it 'decodes non-reserved characters (! and ~)' do
67
+ # My god, it's full of stars
68
+ BLURI('http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21').
69
+ canonicalize!.to_s.should == 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!'
70
+ end
71
+ it 'encodes pound signs' do
72
+ BLURI('https://www.example.com/pound-sign-£').canonicalize!.to_s.should == 'http://www.example.com/pound-sign-%c2%a3'
73
+ end
74
+ end
75
+
76
+ describe 'query strings' do
77
+ it 'disallows all query string params by default' do
78
+ BLURI('http://www.example.com?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
79
+ end
80
+ it 'disallows all params when there''s a slash' do
81
+ BLURI('http://www.example.com/?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
82
+ end
83
+ it 'disallows all params after a slash with fragid' do
84
+ BLURI('http://www.example.com/?q=foo#bar').canonicalize!.to_s.should == 'http://www.example.com'
85
+ end
86
+
87
+ describe 'allowing some or all query string values' do
88
+ it 'allows named query_string parameters' do
89
+ BLURI('http://www.example.com/?q=foo&r=bar').canonicalize!(allow_query: 'q').to_s.should ==
90
+ 'http://www.example.com?q=foo'
91
+ end
92
+ it 'sorts query string values' do
93
+ BLURI('http://www.example.com?c=23&d=1&b=909&e=33&a=1').
94
+ canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
95
+ end
96
+ it 'encodes querystring values' do
97
+ BLURI("http://www.example.com?a=you're_dangerous").canonicalize!(allow_query: :all).to_s.should ==
98
+ 'http://www.example.com?a=you%27re_dangerous'
99
+ end
100
+ it 'whitelists and sorts query strings' do
101
+ BLURI('http://www.example.com?a=1&c=3&b=2').canonicalize!(allow_query: :all).to_s.should ==
102
+ 'http://www.example.com?a=1&b=2&c=3'
103
+ end
104
+ it 'converts matrix URI to query_string' do
105
+ BLURI('http://www.example.com?c=23;d=1;b=909;e=33;a=1').
106
+ canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
107
+ end
108
+ it 'sorts cherry-picked query string arguments' do
109
+ BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
110
+ canonicalize!(allow_query: [:topic, :item]).to_s.should == 'http://www.example.com?item=23444&topic=334499'
111
+ end
112
+ it 'ignores empty querystring values' do
113
+ BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
114
+ canonicalize!(allow_query: %w(foo bar baz)).to_s.should == 'http://www.example.com'
115
+ end
116
+
117
+ describe 'querystrings that are not an HTML-encoded thing' do
118
+ before do
119
+ @bluri = BLURI('http://some.com/a/path?foo&bar').canonicalize!(allow_query: :all)
120
+ end
121
+
122
+ it 'retains the query string' do
123
+ @bluri.query.should == 'bar&foo'
124
+ end
125
+
126
+ it 'has a query hash with empty elements' do
127
+ @bluri.query_hash['foo'].should == nil
128
+ @bluri.query_hash['bar'].should == nil
129
+ end
130
+
131
+ it 'renders the string properly' do
132
+ @bluri.query_hash.to_s.should == 'bar&foo'
133
+ end
134
+ end
135
+ end
136
+
137
+ describe 'degenerate cases' do
138
+ describe 'the treatment of query strings when there are query string octets that unescape to '\
139
+ 'invalid UTF-8 sequences (we no longer treat these as failures)' do
140
+ it 'no longer raises exceptions when there are bad things in query values' do
141
+ BLURI('http://example.com/path?view=%ED').
142
+ canonicalize!(allow_query: :all).
143
+ to_s.should eql('http://example.com/path?view=%ED')
144
+ end
145
+
146
+ it 're-encodes correctly when there are bad things in query keys' do
147
+ BLURI('http://example.com/path?%ED=view').
148
+ canonicalize!(allow_query: :all).
149
+ to_s.should eql('http://example.com/path?%ED=view')
150
+ end
151
+ end
152
+
153
+ describe 'failure to canonicalize paths correctly' do
154
+ # see https://www.pivotaltracker.com/s/projects/860575/stories/54502932
155
+
156
+ subject { BLURI('http://www.voa.gov.uk/stuff/?query=thing').canonicalize!(allow_query: :all) }
157
+
158
+ its(:path) { should eql('/stuff') }
159
+ its(:query) { should eql('query=thing') }
160
+ end
161
+ end
162
+ end
163
+ end
@@ -0,0 +1,59 @@
1
+ require 'spec_helper'
2
+
3
+ describe Optic14n::CanonicalizedUrls do
4
+ describe 'c14nize' do
5
+ let(:test_urls) do
6
+ %w(
7
+ http://www.qhm.mod.uk/portsmouth/leisure/fuel
8
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?
9
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view
10
+ http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view&id=199
11
+ http://unistats.direct.gov.uk/searchResults.do?pname=institutesearchresults&level3Subjects=L3.90%AC10007761%ACFIRSTDEGREE%ACFulltime%AC430%ACNo%AC60%ACYes%AC83%ACNo%ACYes
12
+ 1234://123
13
+ )
14
+ end
15
+
16
+ context 'options[:allow_query] is false' do
17
+ subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: false) }
18
+
19
+ it { should be_a(Optic14n::CanonicalizedUrls) }
20
+
21
+ its(:seen) { should eql(6) }
22
+
23
+ describe 'the output set' do
24
+ subject(:output_set) { c14nizer.output_set }
25
+
26
+ its(:size) { should eql(3) }
27
+
28
+ describe 'the items' do
29
+ subject { output_set.map(&:to_s) }
30
+
31
+ it { should include('http://www.qhm.mod.uk/portsmouth/leisure/fuel') }
32
+ it { should include('http://www.qhm.mod.uk/portsmouth/leisure/lntm') }
33
+ it { should include('http://unistats.direct.gov.uk/searchresults.do') }
34
+ end
35
+ end
36
+ end
37
+
38
+ context 'options[:allow_query] is :all' do
39
+ subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: :all) }
40
+
41
+ describe 'the output set' do
42
+ subject(:output_set) { c14nizer.output_set }
43
+
44
+ its(:size) { should eql(5) }
45
+ end
46
+
47
+ describe 'failures' do
48
+ subject(:failures) { c14nizer.failures }
49
+
50
+ it { should be_a(Hash) }
51
+
52
+ it 'has our last URL and an error' do
53
+ e = failures[test_urls.last]
54
+ e.should be_an(Addressable::URI::InvalidURIError)
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe URI::QueryHash do
4
+ subject(:hash) { {}.extend URI::QueryHash }
5
+
6
+ its(:to_s) { should eql('') }
7
+
8
+ describe 'setting a value by symbol' do
9
+ before { hash['x'] = '1' }
10
+
11
+ its([:x]) { should eql('1') }
12
+ its(['x']) { should eql('1') }
13
+ its(:to_s) { should eql('x=1') }
14
+ end
15
+ end
@@ -0,0 +1 @@
1
+ require 'optic14n'
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ describe URI::QueryHash do
4
+ describe 'non-HTML encoded query strings' do
5
+ subject { { 'foo' => nil, 'bar' => nil }.extend URI::QueryHash }
6
+
7
+ its(['foo']) { should be_nil }
8
+ its(['bar']) { should be_nil }
9
+ its(:to_s) { should eql('foo&bar') }
10
+ end
11
+ end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: optic14n
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Russell Garner
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: addressable
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.3'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: gem_publisher
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: 1.3.0
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: 1.3.0
62
+ description: Canonicalises URLs.
63
+ email:
64
+ - rgarner@zephyros-systems.co.uk
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - Gemfile
71
+ - LICENSE.txt
72
+ - README.md
73
+ - Rakefile
74
+ - jenkins.sh
75
+ - lib/optic14n.rb
76
+ - lib/optic14n/canonicalized_urls.rb
77
+ - lib/optic14n/version.rb
78
+ - lib/tasks/measure_reduction.rake
79
+ - lib/uri/bluri.rb
80
+ - lib/uri/query_hash.rb
81
+ - optic14n.gemspec
82
+ - spec/bluri_spec.rb
83
+ - spec/c14n.t
84
+ - spec/c14n_spec.rb
85
+ - spec/canonicalized_urls_spec.rb
86
+ - spec/query_hash_spec.rb
87
+ - spec/spec_helper.rb
88
+ - spec/uri/query_hash_spec.rb
89
+ homepage: ''
90
+ licenses:
91
+ - MIT
92
+ post_install_message:
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ segments:
103
+ - 0
104
+ hash: 2602697415991458495
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ none: false
107
+ requirements:
108
+ - - ! '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ segments:
112
+ - 0
113
+ hash: 2602697415991458495
114
+ requirements: []
115
+ rubyforge_project:
116
+ rubygems_version: 1.8.23
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Specifically, HTTP URLs, for a limited purpose
120
+ test_files:
121
+ - spec/bluri_spec.rb
122
+ - spec/c14n.t
123
+ - spec/c14n_spec.rb
124
+ - spec/canonicalized_urls_spec.rb
125
+ - spec/query_hash_spec.rb
126
+ - spec/spec_helper.rb
127
+ - spec/uri/query_hash_spec.rb