RubyGems - optic14n - Versions diffs - 2.0.0 - Mend

optic14n 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

data/.gitignore +17 -0
data/Gemfile +8 -0
data/LICENSE.txt +22 -0
data/README.md +60 -0
data/Rakefile +19 -0
data/jenkins.sh +10 -0
data/lib/optic14n/canonicalized_urls.rb +45 -0
data/lib/optic14n/version.rb +3 -0
data/lib/optic14n.rb +10 -0
data/lib/tasks/measure_reduction.rake +15 -0
data/lib/uri/bluri.rb +116 -0
data/lib/uri/query_hash.rb +33 -0
data/optic14n.gemspec +25 -0
data/spec/bluri_spec.rb +88 -0
data/spec/c14n.t +71 -0
data/spec/c14n_spec.rb +163 -0
data/spec/canonicalized_urls_spec.rb +59 -0
data/spec/query_hash_spec.rb +15 -0
data/spec/spec_helper.rb +1 -0
data/spec/uri/query_hash_spec.rb +11 -0
metadata +127 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,17 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in optic14n.gemspec
+gemspec
+group :test do
+  gem 'rspec'
+end

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2013 Government Digital Service
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,60 @@
+# Optic14n
+Canonicalises URLs.
+## Installation
+Add this line to your application's Gemfile:
+    gem 'optic14n'
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install optic14n
+## Usage
+Parse a `BLURI` like this:
+```ruby
+  bluri = BLURI('http://somewhere.com/?a=1&b=2&c=3')
+```
+Canonicalize it according to the [Previously-Established Rules](#the-previously-established-rules) thusly:
+```ruby
+  bluri.canonicalize!
+```
+You can also do site-specific stuff if you know some of the query string will be valuable:
+```ruby
+  bluri.canonicalize!(allow_query: :all)
+```
+```ruby
+  bluri.canonicalize!(allow_query: [:a, :c])
+  # or
+  bluri.canonicalize!(allow_query: ['a', 'c'])
+```
+### The previously-established rules
+This is a gem for canonicalising HTTP URIs such that we can boil our input set of URIs down to something that is much
+smaller than it would otherwise be. We do this aggressively by:
+* lowercasing URIs
+* removing query strings (unless told otherwise)
+* removing fragments
+* escaping and unescaping various characters and escape sequences according to RFC3986
+## Contributing
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request

data/Rakefile ADDED Viewed

@@ -0,0 +1,19 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
+require 'optic14n'
+Dir.glob('lib/tasks/*.rake').each { |r| import r }
+require 'gem_publisher'
+desc 'Publish gem to Rubygems'
+task :publish_gem do
+  gem = GemPublisher.publish_if_updated('optic14n.gemspec', :rubygems)
+  puts "Published #{gem}" if gem
+end
+RSpec::Core::RakeTask.new(:spec)
+task default: :spec
+task test: :spec

data/jenkins.sh ADDED Viewed

@@ -0,0 +1,10 @@
+#!/bin/bash -x
+export RAILS_ENV=test
+export DISPLAY=":99"
+set -e
+rm -f Gemfile.lock
+bundle install --path "${HOME}/bundles/${JOB_NAME}"
+export GOVUK_APP_DOMAIN=dev.gov.uk
+bundle exec rake
+bundle exec rake publish_gem

data/lib/optic14n/canonicalized_urls.rb ADDED Viewed

@@ -0,0 +1,45 @@
+module Optic14n
+  ##
+  # Canonicalizes a set of URLs
+  class CanonicalizedUrls
+    attr_reader :output_set, :seen, :failures, :each
+    extend Forwardable
+    def_delegators :@output_set, :size
+    def initialize(urls, options)
+      @urls = urls
+      @options = options
+    end
+    def canonicalize!
+      @seen = 0
+      @failures = {}
+      @output_set = Set.new
+      @urls.each do |url|
+        begin
+          @output_set.add(BLURI(url).canonicalize!(@options))
+        rescue Exception => e
+          failures[url] = e
+        end
+        @seen += 1
+      end
+    end
+    def write(filename)
+      File.open(filename, 'w') do |file|
+        @output_set.each do |url|
+          file.puts url
+        end
+      end
+    end
+    ##
+    # Canonicalize given urls. +options+ will be passed to +BLURI.parse+
+    def self.from_urls(urls, options = {})
+      CanonicalizedUrls.new(urls, options).tap { |c| c.canonicalize! }
+    end
+  end
+end

data/lib/optic14n/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Optic14n
+  VERSION = '2.0.0'
+end

data/lib/optic14n.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'optic14n/version'
+require 'uri'
+require 'addressable/uri'
+require 'cgi'
+require 'forwardable'
+require 'uri/query_hash'
+require 'uri/bluri'
+require 'optic14n/canonicalized_urls'

data/lib/tasks/measure_reduction.rake ADDED Viewed

@@ -0,0 +1,15 @@
+require 'set'
+namespace :opt do
+  desc 'Measure reduction from canonicalisation'
+  task :measure, [:filename, :output_file] do |_, args|
+    filename = args[:filename]
+    output_file = args[:output_file]
+    Optic14n::CanonicalizedUrls.from_urls(File.read(filename).each_line).tap do |urls|
+      urls.write(output_file) if output_file
+      puts "#{urls.seen} urls seen, #{urls.size} after canonicalisation"
+    end
+  end
+end

data/lib/uri/bluri.rb ADDED Viewed

@@ -0,0 +1,116 @@
+# encoding: utf-8
+module URI
+  ##
+  # A URI class with a bit extra for canonicalising query strings
+  #
+  class BLURI < URI::HTTP
+    PATH_ESCAPE_MAPPINGS = {
+      '[' => '%5b',
+      ']' => '%5d',
+      ',' => '%2c',
+      '"' => '%22',
+      "'" => '%27',
+      '|' => '%7c',
+      '!' => '%21',
+      '£' => '%c2%a3'
+    }
+    PATH_UNESCAPE_MAPPINGS = {
+      '%7e' => '~',
+      '%21' => '!'
+    }
+    REQUIRE_REGEX_ESCAPE = %w<. | ( ) [ ] { } + \ ^ $ * ?> & PATH_ESCAPE_MAPPINGS.keys
+    extend Forwardable
+    def_delegators :@uri, :scheme, :path, :host, :host=, :query, :fragment, :to_s
+    def initialize(uri_str)
+      @uri = ::Addressable::URI.parse(uri_str)
+      raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
+    end
+    def query_hash
+      @query_hash ||= CGI::parse(self.query || '').tap do |query_hash|
+        # By default, CGI::parse produces lots of arrays. Usually they have a single element
+        # in them. That's correct but not terribly usable. Fix it here.
+        query_hash.each_pair { |k, v| query_hash[k] = v[0] if v.length == 1 }
+        query_hash.extend QueryHash
+      end
+    end
+    def query_hash=(value)
+      @query_hash = value
+      @uri.query = @query_hash.to_s == '' ? nil : @query_hash.to_s
+    end
+    def query=(query_str)
+      @query_hash = nil
+      @uri.query = query_str == '' ? nil : query_str
+    end
+    def self.parse(uri_str)
+      # Deal with known URI spec breaks - leading/trailing spaces and unencoded entities
+      if uri_str.is_a? String
+        uri_str = uri_str.strip.downcase.gsub(' ', '%20')
+        uri_str.gsub!('&', '%26') if uri_str =~ /^mailto:.*&.*/
+      end
+      BLURI.new(uri_str)
+    end
+    def has_query?
+      %w(http https).include?(@uri.scheme) && query
+    end
+    def canonicalize!(options = {})
+      @uri.scheme = 'http' if @uri.scheme == 'https'
+      @uri.path = @uri.path.sub(/\/*$/, '') if @uri.path =~ /^*\/$/
+      @uri.path.gsub!(BLURI.path_escape_char_regex,   PATH_ESCAPE_MAPPINGS)
+      @uri.path.gsub!(BLURI.path_unescape_code_regex, PATH_UNESCAPE_MAPPINGS)
+      canonicalize_query!(options)
+      @uri.fragment = nil
+      self
+    end
+    def canonicalize_query!(options)
+      allow_all = (options[:allow_query] == :all)
+      allowed_keys = [options[:allow_query]].flatten.compact unless allow_all
+      query_hash.keep_if do |k, _|
+        allow_all || (allowed_keys.include?(k) || allowed_keys.include?(k.to_sym))
+      end
+      self.query_hash = QueryHash[query_hash.sort_by { |k, _| k }]
+    end
+    ##
+    # Generate a regex which matches all characters in PATH_ESCAPE_MAPPINGS
+    def self.path_escape_char_regex
+      @path_escape_char_regex ||=
+          Regexp.new('[' + PATH_ESCAPE_MAPPINGS.keys.map do |char|
+            REQUIRE_REGEX_ESCAPE.include?(char) ? "\\#{char}" : char
+          end.join + ']')
+    end
+    ##
+    # Generate a regex which matches all escape sequences in PATH_UNESCAPE_MAPPINGS
+    def self.path_unescape_code_regex
+      @path_unescape_code_regex ||= Regexp.new(
+        PATH_UNESCAPE_MAPPINGS.keys.map { |code| "(?:#{code})" }.join('|')
+      )
+    end
+  end
+end
+module Kernel
+  def BLURI(uri_str)
+    ::URI::BLURI.parse(uri_str)
+  end
+  module_function :BLURI
+end

data/lib/uri/query_hash.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module URI
+  ##
+  # Extends a hash with query string rendering/semi-indifferent access
+  module QueryHash
+    def [](key)
+      item = super key
+      item = super(key.to_s) if item.nil? || item.length == 0
+      item.class == Array && item.length == 0 ? nil : item
+    end
+    def to_s
+      keys.map { |key| render_value(key, self[key]) }.join('&')
+    end
+    ##
+    # Creates a new hash populated with the given objects.
+    def self.[](value)
+      Hash[value].tap do |hash|
+        hash.extend(QueryHash)
+      end
+    end
+    private
+    def render_value(key, value)
+      case value
+        when nil   then key
+        when Array then value.map { |el| render_value(key, el) }.join('&')
+        else            URI.encode_www_form_component(key) << '=' << URI.encode_www_form_component(value)
+      end
+    end
+  end
+end

data/optic14n.gemspec ADDED Viewed

@@ -0,0 +1,25 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'optic14n/version'
+Gem::Specification.new do |spec|
+  spec.name          = 'optic14n'
+  spec.version       = Optic14n::VERSION
+  spec.authors       = ['Russell Garner']
+  spec.email         = %w(rgarner@zephyros-systems.co.uk)
+  spec.description   = %q{Canonicalises URLs.}
+  spec.summary       = %q{Specifically, HTTP URLs, for a limited purpose}
+  spec.homepage      = ''
+  spec.license       = 'MIT'
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = %w(lib)
+  spec.add_dependency 'addressable', '~> 2.3'
+  spec.add_development_dependency 'rake'
+  spec.add_development_dependency 'gem_publisher', '~> 1.3.0'
+end

data/spec/bluri_spec.rb ADDED Viewed

@@ -0,0 +1,88 @@
+require 'spec_helper'
+describe URI::BLURI do
+  it 'should be an HTTP URI' do
+    bluri = BLURI('http://some.where.com')
+    bluri.should be_a URI::HTTP
+  end
+  it 'should not allow other schemes' do
+    lambda { BLURI('ftp://foo').should raise_error(ArgumentError) }
+  end
+  it 'should not allow nil' do
+    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
+  end
+  it 'supports scheme' do
+    BLURI('http://foo').scheme.should == 'http'
+  end
+  it 'supports host' do
+    BLURI('http://foo').host.should == 'foo'
+  end
+  it 'supports path' do
+    BLURI('http://foo/a/path').path.should == '/a/path'
+  end
+  it 'supports query' do
+    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
+  end
+  it 'supports fragment' do
+    BLURI('http://foo#fragment').fragment.should == 'fragment'
+  end
+  it 'supports mailto:someone@somewhere' do
+    BLURI('mailto:me@there.com').to_s.should == 'mailto:me@there.com'
+  end
+  it 'corrects unencoded ampersands ins mailto' do # http://www.faqs.org/rfcs/rfc2368.html
+    BLURI('mailto:fruit&veg.newcastle@rpa.gsi.gov.uk').to_s.should == 'mailto:fruit%26veg.newcastle@rpa.gsi.gov.uk'
+  end
+  it 'corrects trailing spaces' do
+    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
+  end
+  it 'corrects leading spaces' do
+    BLURI('  http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
+  end
+  describe 'Query string parsing' do
+    context 'the query string is of HTML-encoded form k=v&q=p' do
+      before do
+        @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
+      end
+      it 'indexes the query string' do
+        @bluri.query_hash['itemid'].should == '1'
+      end
+      it 'allows indexing by symbol' do
+        @bluri.query_hash[:itemid].should == '1'
+      end
+      it 'shows nil for absent items' do
+        @bluri.query_hash[:eerie_flash].should == nil
+      end
+      it 'indexes the second query string item' do
+        @bluri.query_hash['type'].should == 'resource'
+      end
+      it 'allows setting of the query' do
+        @bluri.query = 'furry=really'
+        @bluri.to_s.should == 'http://some.com/a/path?furry=really'
+      end
+    end
+    context 'the querystring is not an HTML-encoded thing' do
+      before do
+        @bluri = BLURI('http://some.com/a/path?foo&bar')
+      end
+      it 'retains the query string' do
+        @bluri.query.should == 'foo&bar'
+      end
+      it 'has a query hash with empty elements' do
+        @bluri.query_hash['foo'].should == nil
+        @bluri.query_hash['foo'].should == nil
+      end
+    end
+  end
+end

data/spec/c14n.t ADDED Viewed

@@ -0,0 +1,71 @@
+# Here for reference, see original at
+# https://github.com/alphagov/redirector/blob/master/tests/lib/c14n.t
+use strict;
+use Test::More;
+require 'lib/c14n.pl';
+#
+#  case
+#
+is(c14n_url("http://www.EXAMPLE.COM/Foo/Bar/BAZ"), "http://www.example.com/foo/bar/baz", "c14n URL is lower-case");
+#
+#  protocol
+#
+is(c14n_url("https://www.example.com"), "http://www.example.com", "translates protocol to http");
+#
+#  slashes
+#
+is(c14n_url("http://www.example.com/"), "http://www.example.com", "drops trailing slash");
+is(c14n_url("http://www.example.com////"), "http://www.example.com", "drops multiple trailing slashes");
+#
+#  fragment identifier
+#
+is(c14n_url("http://www.example.com#foo"), "http://www.example.com", "drops fragment identifier");
+is(c14n_url("http://www.example.com/#foo"), "http://www.example.com", "drops fragment identifier and slashes");
+#
+#  encoding
+#
+is(c14n_url("http://www.example.com/:colon:"), "http://www.example.com/:colon:", "colons");
+is(c14n_url("http://www.example.com/~tide"), "http://www.example.com/~tide", "tide");
+is(c14n_url("http://www.example.com/_underscore_"), "http://www.example.com/_underscore_", "underscore");
+is(c14n_url("http://www.example.com/*asterisk*"), "http://www.example.com/*asterisk*", "asterisk");
+is(c14n_url("http://www.example.com/(parens)"), "http://www.example.com/(parens)", "parens");
+is(c14n_url("http://www.example.com/[square-brackets]"), "http://www.example.com/%5bsquare-brackets%5d", "square-brackets");
+is(c14n_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27', "commas and quotes");
+is(c14n_url("http://www.example.com/problematic-in-curl[]||[and-regexes]"), "http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d", "square brackets and pipes");
+is(c14n_url("http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21"),
+            'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!',
+            "non-reserved character percent decoding");
+is(c14n_url("https://www.example.com/pound-sign-£"), "http://www.example.com/pound-sign-%c2%a3", "pound sign");
+#
+#  query_strings
+#
+is(c14n_url("http://www.example.com?q=foo"), "http://www.example.com", "drops disallowed query-string");
+is(c14n_url("http://www.example.com/?q=foo"), "http://www.example.com", "drops disallowed query-string after slash");
+is(c14n_url("http://www.example.com/?q=foo#bar"), "http://www.example.com", "drops disallowed query-string after a slash with fragid");
+is(c14n_url("http://www.example.com?a=1&c=3&b=2", '*'), "http://www.example.com?a=1&b=2&c=3", "query string wildcard value");
+is(c14n_url("http://www.example.com/?q=foo", "q"), "http://www.example.com?q=foo", "allow named query_string parameter");
+is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "sorts query_string values");
+is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "  b e,c:d, a  "), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "accept colon and space separated allowed values");
+is(c14n_url("http://www.example.com?c=23;d=1;b=909;e=33;a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "converts matrix URI to query_string");
+is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "topic,item"), "http://www.example.com?item=23444&topic=334499", "allows cherry-picked  query_string");
+is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "foo,bar,baz"), "http://www.example.com", "no ? for empty query_string values");
+is(c14n_url("http://www.example.com?a=you're_dangerous", '*'), "http://www.example.com?a=you%27re_dangerous", "escape query string values");
+#
+#  normalise url
+#
+is(normalise_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-CSV-harder-to-%27awk%27', "commas and quotes");

data/spec/c14n_spec.rb ADDED Viewed

@@ -0,0 +1,163 @@
+# encoding: utf-8
+require 'spec_helper'
+describe "Paul's tests, translated from Perl" do
+  it 'lowercases URLs' do
+    BLURI('http://www.EXAMPLE.COM/Foo/Bar/BAZ').canonicalize!.to_s.should == 'http://www.example.com/foo/bar/baz'
+  end
+  describe 'protocol' do
+    it 'translates protocol to http', reason: 'Reduces our input space, everything public anyway' do
+      BLURI('https://www.example.com').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+  end
+  describe 'slashes' do
+    it 'drops single trailing slashes' do
+      BLURI('http://www.example.com/').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    it 'drops multiple trailing slashes' do
+      BLURI('http://www.example.com////').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    it 'drops multiple trailing slashes on the path' do
+      BLURI('http://www.example.com/foo///').canonicalize!.to_s.should == 'http://www.example.com/foo'
+    end
+  end
+  describe 'fragments' do
+    it 'drops fragment identifier', reason: 'They won''t be mapped, so are redundant' do
+      BLURI('http://www.example.com#foo').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    it 'drops fragment identifier and slashes' do
+      BLURI('http://www.example.com/#foo').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+  end
+  describe 'Things to keep verbatim or encode', reason: 'http://tools.ietf.org/html/rfc3986' do
+    it 'retains colons' do
+      BLURI('http://www.example.com/:colon:').canonicalize!.to_s.should == 'http://www.example.com/:colon:'
+    end
+    it 'retains tilde' do
+      BLURI('http://www.example.com/~tilde').canonicalize!.to_s.should == 'http://www.example.com/~tilde'
+    end
+    it 'retains underscores' do
+      BLURI('http://www.example.com/_underscore_').canonicalize!.to_s.should == 'http://www.example.com/_underscore_'
+    end
+    it 'retains asterisks' do
+      BLURI('http://www.example.com/*asterisk*').canonicalize!.to_s.should == 'http://www.example.com/*asterisk*'
+    end
+    it 'retains parens' do
+      BLURI('http://www.example.com/(parens)').canonicalize!.to_s.should == 'http://www.example.com/(parens)'
+    end
+    it 'escapes square brackets' do
+      BLURI('http://www.example.com/[square-brackets]').canonicalize!.to_s.should == 'http://www.example.com/%5bsquare-brackets%5d'
+    end
+    it 'encodes commas and quotes', reason: 'They make csv harder to awk' do
+      BLURI("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'").canonicalize!.to_s.should ==
+          'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27'
+    end
+    it 'encodes square brackets and pipes', reason: "It's problematic in curl and regexes" do
+      BLURI('http://www.example.com/problematic-in-curl[]||[and-regexes]').canonicalize!.to_s.should ==
+          'http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d'
+    end
+    it 'decodes non-reserved characters (! and ~)' do
+      # My god, it's full of stars
+      BLURI('http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21').
+          canonicalize!.to_s.should == 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!'
+    end
+    it 'encodes pound signs' do
+      BLURI('https://www.example.com/pound-sign-£').canonicalize!.to_s.should == 'http://www.example.com/pound-sign-%c2%a3'
+    end
+  end
+  describe 'query strings' do
+    it 'disallows all query string params by default' do
+      BLURI('http://www.example.com?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    it 'disallows all params when there''s a slash' do
+      BLURI('http://www.example.com/?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    it 'disallows all params after a slash with fragid' do
+      BLURI('http://www.example.com/?q=foo#bar').canonicalize!.to_s.should == 'http://www.example.com'
+    end
+    describe 'allowing some or all query string values' do
+      it 'allows named query_string parameters' do
+        BLURI('http://www.example.com/?q=foo&r=bar').canonicalize!(allow_query: 'q').to_s.should ==
+            'http://www.example.com?q=foo'
+      end
+      it 'sorts query string values' do
+        BLURI('http://www.example.com?c=23&d=1&b=909&e=33&a=1').
+          canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
+      end
+      it 'encodes querystring values' do
+        BLURI("http://www.example.com?a=you're_dangerous").canonicalize!(allow_query: :all).to_s.should ==
+          'http://www.example.com?a=you%27re_dangerous'
+      end
+      it 'whitelists and sorts query strings' do
+        BLURI('http://www.example.com?a=1&c=3&b=2').canonicalize!(allow_query: :all).to_s.should ==
+          'http://www.example.com?a=1&b=2&c=3'
+      end
+      it 'converts matrix URI to query_string' do
+        BLURI('http://www.example.com?c=23;d=1;b=909;e=33;a=1').
+          canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
+      end
+      it 'sorts cherry-picked query string arguments' do
+        BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
+          canonicalize!(allow_query: [:topic, :item]).to_s.should == 'http://www.example.com?item=23444&topic=334499'
+      end
+      it 'ignores empty querystring values' do
+        BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
+          canonicalize!(allow_query: %w(foo bar baz)).to_s.should == 'http://www.example.com'
+      end
+      describe 'querystrings that are not an HTML-encoded thing' do
+        before do
+          @bluri = BLURI('http://some.com/a/path?foo&bar').canonicalize!(allow_query: :all)
+        end
+        it 'retains the query string' do
+          @bluri.query.should == 'bar&foo'
+        end
+        it 'has a query hash with empty elements' do
+          @bluri.query_hash['foo'].should == nil
+          @bluri.query_hash['bar'].should == nil
+        end
+        it 'renders the string properly' do
+          @bluri.query_hash.to_s.should == 'bar&foo'
+        end
+      end
+    end
+    describe 'degenerate cases' do
+      describe 'the treatment of query strings when there are query string octets that unescape to '\
+               'invalid UTF-8 sequences (we no longer treat these as failures)' do
+        it 'no longer raises exceptions when there are bad things in query values' do
+          BLURI('http://example.com/path?view=%ED').
+            canonicalize!(allow_query: :all).
+            to_s.should eql('http://example.com/path?view=%ED')
+        end
+        it 're-encodes correctly when there are bad things in query keys' do
+          BLURI('http://example.com/path?%ED=view').
+            canonicalize!(allow_query: :all).
+            to_s.should eql('http://example.com/path?%ED=view')
+        end
+      end
+      describe 'failure to canonicalize paths correctly' do
+        # see https://www.pivotaltracker.com/s/projects/860575/stories/54502932
+        subject { BLURI('http://www.voa.gov.uk/stuff/?query=thing').canonicalize!(allow_query: :all) }
+        its(:path) { should eql('/stuff') }
+        its(:query) { should eql('query=thing') }
+      end
+    end
+  end
+end

data/spec/canonicalized_urls_spec.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'spec_helper'
+describe Optic14n::CanonicalizedUrls do
+  describe 'c14nize' do
+    let(:test_urls) do
+      %w(
+        http://www.qhm.mod.uk/portsmouth/leisure/fuel
+        http://www.qhm.mod.uk/portsmouth/leisure/lntm?
+        http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view
+        http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view&id=199
+        http://unistats.direct.gov.uk/searchResults.do?pname=institutesearchresults&level3Subjects=L3.90%AC10007761%ACFIRSTDEGREE%ACFulltime%AC430%ACNo%AC60%ACYes%AC83%ACNo%ACYes
+        1234://123
+      )
+    end
+    context 'options[:allow_query] is false' do
+      subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: false) }
+      it { should be_a(Optic14n::CanonicalizedUrls) }
+      its(:seen) { should eql(6) }
+      describe 'the output set' do
+        subject(:output_set) { c14nizer.output_set }
+        its(:size) { should eql(3) }
+        describe 'the items' do
+          subject { output_set.map(&:to_s) }
+          it { should include('http://www.qhm.mod.uk/portsmouth/leisure/fuel') }
+          it { should include('http://www.qhm.mod.uk/portsmouth/leisure/lntm') }
+          it { should include('http://unistats.direct.gov.uk/searchresults.do') }
+        end
+      end
+    end
+    context 'options[:allow_query] is :all' do
+      subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: :all) }
+      describe 'the output set' do
+        subject(:output_set) { c14nizer.output_set }
+        its(:size) { should eql(5) }
+      end
+      describe 'failures' do
+        subject(:failures) { c14nizer.failures }
+        it { should be_a(Hash) }
+        it 'has our last URL and an error' do
+          e = failures[test_urls.last]
+          e.should be_an(Addressable::URI::InvalidURIError)
+        end
+      end
+    end
+  end
+end

data/spec/query_hash_spec.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'spec_helper'
+describe URI::QueryHash do
+  subject(:hash) { {}.extend URI::QueryHash }
+  its(:to_s) { should eql('') }
+  describe 'setting a value by symbol' do
+    before { hash['x'] = '1' }
+    its([:x])    { should eql('1') }
+    its(['x'])   { should eql('1') }
+    its(:to_s)   { should eql('x=1') }
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1 @@
+require 'optic14n'

data/spec/uri/query_hash_spec.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'spec_helper'
+describe URI::QueryHash do
+  describe 'non-HTML encoded query strings' do
+    subject { { 'foo' => nil, 'bar' => nil }.extend URI::QueryHash }
+    its(['foo']) { should be_nil }
+    its(['bar']) { should be_nil }
+    its(:to_s)   { should eql('foo&bar') }
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,127 @@
+--- !ruby/object:Gem::Specification
+name: optic14n
+version: !ruby/object:Gem::Version
+  version: 2.0.0
+  prerelease:
+platform: ruby
+authors:
+- Russell Garner
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-03-27 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: addressable
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.3'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: gem_publisher
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.3.0
+description: Canonicalises URLs.
+email:
+- rgarner@zephyros-systems.co.uk
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- jenkins.sh
+- lib/optic14n.rb
+- lib/optic14n/canonicalized_urls.rb
+- lib/optic14n/version.rb
+- lib/tasks/measure_reduction.rake
+- lib/uri/bluri.rb
+- lib/uri/query_hash.rb
+- optic14n.gemspec
+- spec/bluri_spec.rb
+- spec/c14n.t
+- spec/c14n_spec.rb
+- spec/canonicalized_urls_spec.rb
+- spec/query_hash_spec.rb
+- spec/spec_helper.rb
+- spec/uri/query_hash_spec.rb
+homepage: ''
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 2602697415991458495
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 2602697415991458495
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Specifically, HTTP URLs, for a limited purpose
+test_files:
+- spec/bluri_spec.rb
+- spec/c14n.t
+- spec/c14n_spec.rb
+- spec/canonicalized_urls_spec.rb
+- spec/query_hash_spec.rb
+- spec/spec_helper.rb
+- spec/uri/query_hash_spec.rb