optic14n 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +60 -0
- data/Rakefile +19 -0
- data/jenkins.sh +10 -0
- data/lib/optic14n/canonicalized_urls.rb +45 -0
- data/lib/optic14n/version.rb +3 -0
- data/lib/optic14n.rb +10 -0
- data/lib/tasks/measure_reduction.rake +15 -0
- data/lib/uri/bluri.rb +116 -0
- data/lib/uri/query_hash.rb +33 -0
- data/optic14n.gemspec +25 -0
- data/spec/bluri_spec.rb +88 -0
- data/spec/c14n.t +71 -0
- data/spec/c14n_spec.rb +163 -0
- data/spec/canonicalized_urls_spec.rb +59 -0
- data/spec/query_hash_spec.rb +15 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/uri/query_hash_spec.rb +11 -0
- metadata +127 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Government Digital Service
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Optic14n
|
2
|
+
|
3
|
+
Canonicalises URLs.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'optic14n'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install optic14n
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Parse a `BLURI` like this:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
bluri = BLURI('http://somewhere.com/?a=1&b=2&c=3')
|
25
|
+
```
|
26
|
+
|
27
|
+
Canonicalize it according to the [Previously-Established Rules](#the-previously-established-rules) thusly:
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
bluri.canonicalize!
|
31
|
+
```
|
32
|
+
|
33
|
+
You can also keep site-specific query parameters if you know that some of the query string is significant:
|
34
|
+
```ruby
|
35
|
+
bluri.canonicalize!(allow_query: :all)
|
36
|
+
```
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
bluri.canonicalize!(allow_query: [:a, :c])
|
40
|
+
# or
|
41
|
+
bluri.canonicalize!(allow_query: ['a', 'c'])
|
42
|
+
```
|
43
|
+
|
44
|
+
### The previously-established rules
|
45
|
+
|
46
|
+
This is a gem for canonicalising HTTP URIs such that we can boil our input set of URIs down to something that is much
|
47
|
+
smaller than it would otherwise be. We do this aggressively by:
|
48
|
+
|
49
|
+
* lowercasing URIs
|
50
|
+
* removing query strings (unless told otherwise)
|
51
|
+
* removing fragments
|
52
|
+
* escaping and unescaping various characters and escape sequences according to RFC3986
|
53
|
+
|
54
|
+
## Contributing
|
55
|
+
|
56
|
+
1. Fork it
|
57
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
58
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
59
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
60
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Make the gem's own lib/ directory loadable. This Rakefile sits at the
# project root, so lib/ is found next to it (the previous '..', 'lib' join
# pointed at a lib/ directory outside the project).
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'optic14n'

# Pull in the project's additional rake tasks.
Dir.glob('lib/tasks/*.rake').each { |r| import r }

require 'gem_publisher'
desc 'Publish gem to Rubygems'
task :publish_gem do
  gem = GemPublisher.publish_if_updated('optic14n.gemspec', :rubygems)
  # Only report when publish_if_updated returned something truthy.
  puts "Published #{gem}" if gem
end

RSpec::Core::RakeTask.new(:spec)

task default: :spec
task test: :spec
|
data/jenkins.sh
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Optic14n
  ##
  # Canonicalizes a set of URLs, collecting the distinct canonical forms and
  # recording every URL that could not be processed.
  class CanonicalizedUrls
    # output_set:: Set of canonicalized results (nil until #canonicalize! runs)
    # seen::       number of input URLs processed by the last #canonicalize!
    # failures::   Hash mapping each failed input URL to the error it raised
    # each::       NOTE(review): @each is never assigned in this class, so this
    #              reader always returns nil; kept only so the public interface
    #              is unchanged.
    attr_reader :output_set, :seen, :failures, :each

    extend Forwardable

    def_delegators :@output_set, :size

    ##
    # +urls+ is any object responding to +each+ that yields URLs;
    # +options+ is handed to +BLURI#canonicalize!+ for every URL.
    def initialize(urls, options)
      @urls = urls
      @options = options
    end

    ##
    # Canonicalizes every URL, resetting any previous results first.
    def canonicalize!
      @seen = 0
      @failures = {}
      @output_set = Set.new

      @urls.each do |url|
        begin
          @output_set.add(BLURI(url).canonicalize!(@options))
        rescue StandardError => e
          # Only ordinary errors are recorded per URL; interrupts, exit
          # requests and similar non-StandardError exceptions must propagate.
          @failures[url] = e
        end
        @seen += 1
      end
    end

    ##
    # Writes the canonicalized URLs to +filename+, one per line.
    def write(filename)
      File.open(filename, 'w') do |file|
        @output_set.each do |url|
          file.puts url
        end
      end
    end

    ##
    # Canonicalize given urls. +options+ will be passed to
    # +BLURI#canonicalize!+ for each URL. Returns the populated instance.
    def self.from_urls(urls, options = {})
      CanonicalizedUrls.new(urls, options).tap { |c| c.canonicalize! }
    end
  end
end
|
data/lib/optic14n.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
namespace :opt do
  desc 'Measure reduction from canonicalisation'
  # Usage: rake "opt:measure[filename,output_file]" -- output_file is optional.
  # Reads one URL per line from filename, canonicalises them and reports how
  # many distinct URLs remain.
  task :measure, [:filename, :output_file] do |_, args|
    filename = args[:filename]
    output_file = args[:output_file]
    raise ArgumentError, 'filename is required: rake "opt:measure[filename,output_file]"' unless filename

    # File.foreach yields lines lazily, so the input is never held in memory
    # as one large string.
    urls = Optic14n::CanonicalizedUrls.from_urls(File.foreach(filename))
    urls.write(output_file) if output_file

    puts "#{urls.seen} urls seen, #{urls.size} after canonicalisation"
  end
end
|
data/lib/uri/bluri.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module URI
  ##
  # A URI class with a bit extra for canonicalising query strings.
  #
  # Parsing and component access are delegated to an Addressable::URI held in
  # @uri; URI::HTTP's own initializer is not invoked.
  class BLURI < URI::HTTP
    # Characters that are percent-encoded when they appear in a path.
    PATH_ESCAPE_MAPPINGS = {
      '[' => '%5b',
      ']' => '%5d',
      ',' => '%2c',
      '"' => '%22',
      "'" => '%27',
      '|' => '%7c',
      '!' => '%21',
      '£' => '%c2%a3'
    }.freeze

    # Escape sequences that are decoded when they appear in a path. Applied
    # after PATH_ESCAPE_MAPPINGS, so '!' always ends up unescaped.
    PATH_UNESCAPE_MAPPINGS = {
      '%7e' => '~',
      '%21' => '!'
    }.freeze

    # Retained for backward compatibility only: the regex builders below use
    # Regexp.union and no longer consult this list.
    REQUIRE_REGEX_ESCAPE = %w<. | ( ) [ ] { } + \ ^ $ * ?> & PATH_ESCAPE_MAPPINGS.keys

    extend Forwardable

    def_delegators :@uri, :scheme, :path, :host, :host=, :query, :fragment, :to_s

    ##
    # Wraps +uri_str+ in an Addressable::URI.
    # Raises URI::InvalidURIError when Addressable returns nothing for it
    # (for example when +uri_str+ is nil).
    def initialize(uri_str)
      @uri = ::Addressable::URI.parse(uri_str)
      raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
    end

    ##
    # The query string parsed into a QueryHash (memoized). Keys with exactly
    # one value map to that value rather than to a one-element array.
    def query_hash
      @query_hash ||= CGI.parse(query || '').tap do |parsed|
        # By default, CGI.parse produces lots of arrays. Usually they have a
        # single element in them. That's correct but not terribly usable.
        parsed.each_pair { |k, v| parsed[k] = v.first if v.length == 1 }
        parsed.extend QueryHash
      end
    end

    ##
    # Replaces the query with the rendering of +value+; an empty rendering
    # removes the query altogether.
    def query_hash=(value)
      @query_hash = value
      rendered = @query_hash.to_s
      @uri.query = rendered == '' ? nil : rendered
    end

    ##
    # Sets the raw query string ('' removes it) and discards the memoized
    # query hash so that it is re-parsed on next access.
    def query=(query_str)
      @query_hash = nil
      @uri.query = query_str == '' ? nil : query_str
    end

    ##
    # Builds a BLURI from +uri_str+. String input is stripped, lowercased and
    # has its spaces encoded; ampersands in mailto: URIs are encoded as well.
    def self.parse(uri_str)
      # Deal with known URI spec breaks - leading/trailing spaces and unencoded entities
      if uri_str.is_a? String
        uri_str = uri_str.strip.downcase.gsub(' ', '%20')
        uri_str.gsub!('&', '%26') if uri_str =~ /^mailto:.*&.*/
      end
      BLURI.new(uri_str)
    end

    ##
    # Truthy when the scheme is http or https and a query is present. Note
    # that the query string itself (not +true+) is returned in that case.
    def has_query?
      %w(http https).include?(@uri.scheme) && query
    end

    ##
    # Canonicalizes this URI in place and returns self: https becomes http,
    # trailing slashes are dropped, path characters are (un)escaped per the
    # PATH_*_MAPPINGS, the query is filtered and sorted (see
    # #canonicalize_query!) and the fragment is removed.
    def canonicalize!(options = {})
      @uri.scheme = 'http' if @uri.scheme == 'https'

      # Assign a freshly built path instead of mutating the string handed out
      # by Addressable, which may be shared internal state.
      new_path = canonical_path(@uri.path)
      @uri.path = new_path unless new_path == @uri.path

      canonicalize_query!(options)

      @uri.fragment = nil
      self
    end

    ##
    # Keeps only the query keys permitted by options[:allow_query] (:all, a
    # single key, or a list of keys given as strings or symbols) and sorts
    # what remains by key.
    def canonicalize_query!(options)
      allow_all = (options[:allow_query] == :all)
      allowed_keys = [options[:allow_query]].flatten.compact unless allow_all

      query_hash.keep_if do |k, _|
        allow_all || allowed_keys.include?(k) || allowed_keys.include?(k.to_sym)
      end

      self.query_hash = QueryHash[query_hash.sort_by { |k, _| k }]
    end

    ##
    # Generate a regex which matches all characters in PATH_ESCAPE_MAPPINGS
    def self.path_escape_char_regex
      @path_escape_char_regex ||= Regexp.union(PATH_ESCAPE_MAPPINGS.keys)
    end

    ##
    # Generate a regex which matches all escape sequences in PATH_UNESCAPE_MAPPINGS
    def self.path_unescape_code_regex
      @path_unescape_code_regex ||= Regexp.union(PATH_UNESCAPE_MAPPINGS.keys)
    end

    private

    # Returns a canonicalized copy of +path+ without modifying the argument.
    def canonical_path(path)
      path.sub(%r{/+\z}, '')
          .gsub(BLURI.path_escape_char_regex, PATH_ESCAPE_MAPPINGS)
          .gsub(BLURI.path_unescape_code_regex, PATH_UNESCAPE_MAPPINGS)
    end
  end
end

module Kernel
  ##
  # Convenience constructor: BLURI('http://example.com') is shorthand for
  # URI::BLURI.parse('http://example.com').
  def BLURI(uri_str)
    ::URI::BLURI.parse(uri_str)
  end

  module_function :BLURI
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module URI
  ##
  # Extends a hash with query string rendering/semi-indifferent access
  module QueryHash
    ##
    # Looks +key+ up as given and, when that yields nothing (nil or an empty
    # value), again under its string form. Empty arrays are reported as nil.
    def [](key)
      item = super(key)
      item = super(key.to_s) if item.nil? || (item.respond_to?(:empty?) && item.empty?)
      item.is_a?(Array) && item.empty? ? nil : item
    end

    ##
    # Renders the hash as a query string, in key order, joined with '&'.
    def to_s
      keys.map { |key| render_value(key, self[key]) }.join('&')
    end

    ##
    # Creates a new hash populated with the given objects.
    def self.[](value)
      Hash[value].tap do |hash|
        hash.extend(QueryHash)
      end
    end

    private

    # Renders one key: bare for a nil value, once per element for an array,
    # otherwise as key=value. The key is form-encoded in every case so that
    # value-less keys cannot inject unescaped characters into the query.
    def render_value(key, value)
      encoded_key = URI.encode_www_form_component(key)

      case value
      when nil then encoded_key
      when Array then value.map { |el| render_value(key, el) }.join('&')
      else "#{encoded_key}=#{URI.encode_www_form_component(value)}"
      end
    end
  end
end
|
data/optic14n.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'optic14n/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'optic14n'
|
8
|
+
spec.version = Optic14n::VERSION
|
9
|
+
spec.authors = ['Russell Garner']
|
10
|
+
spec.email = %w(rgarner@zephyros-systems.co.uk)
|
11
|
+
spec.description = %q{Canonicalises URLs.}
|
12
|
+
spec.summary = %q{Specifically, HTTP URLs, for a limited purpose}
|
13
|
+
spec.homepage = ''
|
14
|
+
spec.license = 'MIT'
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = %w(lib)
|
20
|
+
|
21
|
+
spec.add_dependency 'addressable', '~> 2.3'
|
22
|
+
|
23
|
+
spec.add_development_dependency 'rake'
|
24
|
+
spec.add_development_dependency 'gem_publisher', '~> 1.3.0'
|
25
|
+
end
|
data/spec/bluri_spec.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe URI::BLURI do
  it 'should be an HTTP URI' do
    bluri = BLURI('http://some.where.com')
    bluri.should be_a URI::HTTP
  end

  it 'should not allow other schemes' do
    # The expectation used to sit inside a lambda that was never called, so
    # nothing was asserted. BLURI.parse performs no scheme check, so the
    # corrected expectation is marked pending rather than left vacuous.
    pending 'BLURI does not reject non-HTTP schemes'
    lambda { BLURI('ftp://foo') }.should raise_error(ArgumentError)
  end

  it 'should not allow nil' do
    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
  end

  it 'supports scheme' do
    BLURI('http://foo').scheme.should == 'http'
  end
  it 'supports host' do
    BLURI('http://foo').host.should == 'foo'
  end
  it 'supports path' do
    BLURI('http://foo/a/path').path.should == '/a/path'
  end
  it 'supports query' do
    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
  end
  it 'supports fragment' do
    BLURI('http://foo#fragment').fragment.should == 'fragment'
  end
  it 'supports mailto:someone@somewhere' do
    BLURI('mailto:me@there.com').to_s.should == 'mailto:me@there.com'
  end
  it 'corrects unencoded ampersands in mailto' do # http://www.faqs.org/rfcs/rfc2368.html
    BLURI('mailto:fruit&veg.newcastle@rpa.gsi.gov.uk').to_s.should == 'mailto:fruit%26veg.newcastle@rpa.gsi.gov.uk'
  end
  it 'corrects trailing spaces' do
    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
  end
  it 'corrects leading spaces' do
    BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  describe 'Query string parsing' do
    context 'the query string is of HTML-encoded form k=v&q=p' do
      before do
        @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
      end

      it 'indexes the query string' do
        @bluri.query_hash['itemid'].should == '1'
      end

      it 'allows indexing by symbol' do
        @bluri.query_hash[:itemid].should == '1'
      end

      it 'shows nil for absent items' do
        @bluri.query_hash[:eerie_flash].should == nil
      end

      it 'indexes the second query string item' do
        # BLURI() lowercases the whole URL, values included.
        @bluri.query_hash['type'].should == 'resource'
      end

      it 'allows setting of the query' do
        @bluri.query = 'furry=really'
        @bluri.to_s.should == 'http://some.com/a/path?furry=really'
      end
    end

    context 'the querystring is not an HTML-encoded thing' do
      before do
        @bluri = BLURI('http://some.com/a/path?foo&bar')
      end

      it 'retains the query string' do
        @bluri.query.should == 'foo&bar'
      end

      it 'has a query hash with empty elements' do
        # Check both keys; 'foo' used to be asserted twice.
        @bluri.query_hash['foo'].should == nil
        @bluri.query_hash['bar'].should == nil
      end
    end
  end
end
|
data/spec/c14n.t
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Here for reference, see original at
|
2
|
+
# https://github.com/alphagov/redirector/blob/master/tests/lib/c14n.t
|
3
|
+
|
4
|
+
use strict;
|
5
|
+
use Test::More;
|
6
|
+
require 'lib/c14n.pl';
|
7
|
+
|
8
|
+
#
|
9
|
+
# case
|
10
|
+
#
|
11
|
+
is(c14n_url("http://www.EXAMPLE.COM/Foo/Bar/BAZ"), "http://www.example.com/foo/bar/baz", "c14n URL is lower-case");
|
12
|
+
|
13
|
+
#
|
14
|
+
# protocol
|
15
|
+
#
|
16
|
+
is(c14n_url("https://www.example.com"), "http://www.example.com", "translates protocol to http");
|
17
|
+
|
18
|
+
#
|
19
|
+
# slashes
|
20
|
+
#
|
21
|
+
is(c14n_url("http://www.example.com/"), "http://www.example.com", "drops trailing slash");
|
22
|
+
is(c14n_url("http://www.example.com////"), "http://www.example.com", "drops multiple trailing slashes");
|
23
|
+
|
24
|
+
#
|
25
|
+
# fragment identifier
|
26
|
+
#
|
27
|
+
is(c14n_url("http://www.example.com#foo"), "http://www.example.com", "drops fragment identifier");
|
28
|
+
is(c14n_url("http://www.example.com/#foo"), "http://www.example.com", "drops fragment identifier and slashes");
|
29
|
+
|
30
|
+
#
|
31
|
+
# encoding
|
32
|
+
#
|
33
|
+
is(c14n_url("http://www.example.com/:colon:"), "http://www.example.com/:colon:", "colons");
|
34
|
+
is(c14n_url("http://www.example.com/~tide"), "http://www.example.com/~tide", "tide");
|
35
|
+
is(c14n_url("http://www.example.com/_underscore_"), "http://www.example.com/_underscore_", "underscore");
|
36
|
+
is(c14n_url("http://www.example.com/*asterisk*"), "http://www.example.com/*asterisk*", "asterisk");
|
37
|
+
is(c14n_url("http://www.example.com/(parens)"), "http://www.example.com/(parens)", "parens");
|
38
|
+
is(c14n_url("http://www.example.com/[square-brackets]"), "http://www.example.com/%5bsquare-brackets%5d", "square-brackets");
|
39
|
+
|
40
|
+
is(c14n_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27', "commas and quotes");
|
41
|
+
is(c14n_url("http://www.example.com/problematic-in-curl[]||[and-regexes]"), "http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d", "square brackets and pipes");
|
42
|
+
is(c14n_url("http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21"),
|
43
|
+
'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!',
|
44
|
+
"non-reserved character percent decoding");
|
45
|
+
|
46
|
+
is(c14n_url("https://www.example.com/pound-sign-£"), "http://www.example.com/pound-sign-%c2%a3", "pound sign");
|
47
|
+
|
48
|
+
#
|
49
|
+
# query_strings
|
50
|
+
#
|
51
|
+
is(c14n_url("http://www.example.com?q=foo"), "http://www.example.com", "drops disallowed query-string");
|
52
|
+
is(c14n_url("http://www.example.com/?q=foo"), "http://www.example.com", "drops disallowed query-string after slash");
|
53
|
+
is(c14n_url("http://www.example.com/?q=foo#bar"), "http://www.example.com", "drops disallowed query-string after a slash with fragid");
|
54
|
+
|
55
|
+
is(c14n_url("http://www.example.com?a=1&c=3&b=2", '*'), "http://www.example.com?a=1&b=2&c=3", "query string wildcard value");
|
56
|
+
|
57
|
+
is(c14n_url("http://www.example.com/?q=foo", "q"), "http://www.example.com?q=foo", "allow named query_string parameter");
|
58
|
+
|
59
|
+
is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "sorts query_string values");
|
60
|
+
is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", " b e,c:d, a "), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "accept colon and space separated allowed values");
|
61
|
+
is(c14n_url("http://www.example.com?c=23;d=1;b=909;e=33;a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "converts matrix URI to query_string");
|
62
|
+
|
63
|
+
is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "topic,item"), "http://www.example.com?item=23444&topic=334499", "allows cherry-picked query_string");
|
64
|
+
is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "foo,bar,baz"), "http://www.example.com", "no ? for empty query_string values");
|
65
|
+
|
66
|
+
is(c14n_url("http://www.example.com?a=you're_dangerous", '*'), "http://www.example.com?a=you%27re_dangerous", "escape query string values");
|
67
|
+
|
68
|
+
#
|
69
|
+
# normalise url
|
70
|
+
#
|
71
|
+
is(normalise_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-CSV-harder-to-%27awk%27', "commas and quotes");
|
data/spec/c14n_spec.rb
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe "Paul's tests, translated from Perl" do
|
6
|
+
it 'lowercases URLs' do
|
7
|
+
BLURI('http://www.EXAMPLE.COM/Foo/Bar/BAZ').canonicalize!.to_s.should == 'http://www.example.com/foo/bar/baz'
|
8
|
+
end
|
9
|
+
|
10
|
+
describe 'protocol' do
|
11
|
+
it 'translates protocol to http', reason: 'Reduces our input space, everything public anyway' do
|
12
|
+
BLURI('https://www.example.com').canonicalize!.to_s.should == 'http://www.example.com'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
describe 'slashes' do
|
17
|
+
it 'drops single trailing slashes' do
|
18
|
+
BLURI('http://www.example.com/').canonicalize!.to_s.should == 'http://www.example.com'
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'drops multiple trailing slashes' do
|
22
|
+
BLURI('http://www.example.com////').canonicalize!.to_s.should == 'http://www.example.com'
|
23
|
+
end
|
24
|
+
|
25
|
+
it 'drops multiple trailing slashes on the path' do
|
26
|
+
BLURI('http://www.example.com/foo///').canonicalize!.to_s.should == 'http://www.example.com/foo'
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
describe 'fragments' do
  # The reason is double-quoted so that its apostrophe survives: in Ruby,
  # 'won''t' is two adjacent literals and evaluates to "wont".
  it 'drops fragment identifier', reason: "They won't be mapped, so are redundant" do
    BLURI('http://www.example.com#foo').canonicalize!.to_s.should == 'http://www.example.com'
  end
  it 'drops fragment identifier and slashes' do
    BLURI('http://www.example.com/#foo').canonicalize!.to_s.should == 'http://www.example.com'
  end
end
|
38
|
+
|
39
|
+
describe 'Things to keep verbatim or encode', reason: 'http://tools.ietf.org/html/rfc3986' do
|
40
|
+
it 'retains colons' do
|
41
|
+
BLURI('http://www.example.com/:colon:').canonicalize!.to_s.should == 'http://www.example.com/:colon:'
|
42
|
+
end
|
43
|
+
it 'retains tilde' do
|
44
|
+
BLURI('http://www.example.com/~tilde').canonicalize!.to_s.should == 'http://www.example.com/~tilde'
|
45
|
+
end
|
46
|
+
it 'retains underscores' do
|
47
|
+
BLURI('http://www.example.com/_underscore_').canonicalize!.to_s.should == 'http://www.example.com/_underscore_'
|
48
|
+
end
|
49
|
+
it 'retains asterisks' do
|
50
|
+
BLURI('http://www.example.com/*asterisk*').canonicalize!.to_s.should == 'http://www.example.com/*asterisk*'
|
51
|
+
end
|
52
|
+
it 'retains parens' do
|
53
|
+
BLURI('http://www.example.com/(parens)').canonicalize!.to_s.should == 'http://www.example.com/(parens)'
|
54
|
+
end
|
55
|
+
it 'escapes square brackets' do
|
56
|
+
BLURI('http://www.example.com/[square-brackets]').canonicalize!.to_s.should == 'http://www.example.com/%5bsquare-brackets%5d'
|
57
|
+
end
|
58
|
+
it 'encodes commas and quotes', reason: 'They make csv harder to awk' do
|
59
|
+
BLURI("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'").canonicalize!.to_s.should ==
|
60
|
+
'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27'
|
61
|
+
end
|
62
|
+
it 'encodes square brackets and pipes', reason: "It's problematic in curl and regexes" do
|
63
|
+
BLURI('http://www.example.com/problematic-in-curl[]||[and-regexes]').canonicalize!.to_s.should ==
|
64
|
+
'http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d'
|
65
|
+
end
|
66
|
+
it 'decodes non-reserved characters (! and ~)' do
|
67
|
+
# My god, it's full of stars
|
68
|
+
BLURI('http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21').
|
69
|
+
canonicalize!.to_s.should == 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!'
|
70
|
+
end
|
71
|
+
it 'encodes pound signs' do
|
72
|
+
BLURI('https://www.example.com/pound-sign-£').canonicalize!.to_s.should == 'http://www.example.com/pound-sign-%c2%a3'
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
describe 'query strings' do
|
77
|
+
it 'disallows all query string params by default' do
|
78
|
+
BLURI('http://www.example.com?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
|
79
|
+
end
|
80
|
+
# Double-quoted so the apostrophe is kept ('there''s' evaluates to "theres").
it "disallows all params when there's a slash" do
  BLURI('http://www.example.com/?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
end
|
83
|
+
it 'disallows all params after a slash with fragid' do
|
84
|
+
BLURI('http://www.example.com/?q=foo#bar').canonicalize!.to_s.should == 'http://www.example.com'
|
85
|
+
end
|
86
|
+
|
87
|
+
describe 'allowing some or all query string values' do
|
88
|
+
it 'allows named query_string parameters' do
|
89
|
+
BLURI('http://www.example.com/?q=foo&r=bar').canonicalize!(allow_query: 'q').to_s.should ==
|
90
|
+
'http://www.example.com?q=foo'
|
91
|
+
end
|
92
|
+
it 'sorts query string values' do
|
93
|
+
BLURI('http://www.example.com?c=23&d=1&b=909&e=33&a=1').
|
94
|
+
canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
|
95
|
+
end
|
96
|
+
it 'encodes querystring values' do
|
97
|
+
BLURI("http://www.example.com?a=you're_dangerous").canonicalize!(allow_query: :all).to_s.should ==
|
98
|
+
'http://www.example.com?a=you%27re_dangerous'
|
99
|
+
end
|
100
|
+
it 'whitelists and sorts query strings' do
|
101
|
+
BLURI('http://www.example.com?a=1&c=3&b=2').canonicalize!(allow_query: :all).to_s.should ==
|
102
|
+
'http://www.example.com?a=1&b=2&c=3'
|
103
|
+
end
|
104
|
+
it 'converts matrix URI to query_string' do
|
105
|
+
BLURI('http://www.example.com?c=23;d=1;b=909;e=33;a=1').
|
106
|
+
canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
|
107
|
+
end
|
108
|
+
it 'sorts cherry-picked query string arguments' do
|
109
|
+
BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
|
110
|
+
canonicalize!(allow_query: [:topic, :item]).to_s.should == 'http://www.example.com?item=23444&topic=334499'
|
111
|
+
end
|
112
|
+
it 'ignores empty querystring values' do
|
113
|
+
BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
|
114
|
+
canonicalize!(allow_query: %w(foo bar baz)).to_s.should == 'http://www.example.com'
|
115
|
+
end
|
116
|
+
|
117
|
+
describe 'querystrings that are not an HTML-encoded thing' do
|
118
|
+
before do
|
119
|
+
@bluri = BLURI('http://some.com/a/path?foo&bar').canonicalize!(allow_query: :all)
|
120
|
+
end
|
121
|
+
|
122
|
+
it 'retains the query string' do
|
123
|
+
@bluri.query.should == 'bar&foo'
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'has a query hash with empty elements' do
|
127
|
+
@bluri.query_hash['foo'].should == nil
|
128
|
+
@bluri.query_hash['bar'].should == nil
|
129
|
+
end
|
130
|
+
|
131
|
+
it 'renders the string properly' do
|
132
|
+
@bluri.query_hash.to_s.should == 'bar&foo'
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
describe 'degenerate cases' do
|
138
|
+
describe 'the treatment of query strings when there are query string octets that unescape to '\
|
139
|
+
'invalid UTF-8 sequences (we no longer treat these as failures)' do
|
140
|
+
it 'no longer raises exceptions when there are bad things in query values' do
|
141
|
+
BLURI('http://example.com/path?view=%ED').
|
142
|
+
canonicalize!(allow_query: :all).
|
143
|
+
to_s.should eql('http://example.com/path?view=%ED')
|
144
|
+
end
|
145
|
+
|
146
|
+
it 're-encodes correctly when there are bad things in query keys' do
|
147
|
+
BLURI('http://example.com/path?%ED=view').
|
148
|
+
canonicalize!(allow_query: :all).
|
149
|
+
to_s.should eql('http://example.com/path?%ED=view')
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
describe 'failure to canonicalize paths correctly' do
|
154
|
+
# see https://www.pivotaltracker.com/s/projects/860575/stories/54502932
|
155
|
+
|
156
|
+
subject { BLURI('http://www.voa.gov.uk/stuff/?query=thing').canonicalize!(allow_query: :all) }
|
157
|
+
|
158
|
+
its(:path) { should eql('/stuff') }
|
159
|
+
its(:query) { should eql('query=thing') }
|
160
|
+
end
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Optic14n::CanonicalizedUrls do
|
4
|
+
describe 'c14nize' do
|
5
|
+
let(:test_urls) do
|
6
|
+
%w(
|
7
|
+
http://www.qhm.mod.uk/portsmouth/leisure/fuel
|
8
|
+
http://www.qhm.mod.uk/portsmouth/leisure/lntm?
|
9
|
+
http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view
|
10
|
+
http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view&id=199
|
11
|
+
http://unistats.direct.gov.uk/searchResults.do?pname=institutesearchresults&level3Subjects=L3.90%AC10007761%ACFIRSTDEGREE%ACFulltime%AC430%ACNo%AC60%ACYes%AC83%ACNo%ACYes
|
12
|
+
1234://123
|
13
|
+
)
|
14
|
+
end
|
15
|
+
|
16
|
+
context 'options[:allow_query] is false' do
|
17
|
+
subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: false) }
|
18
|
+
|
19
|
+
it { should be_a(Optic14n::CanonicalizedUrls) }
|
20
|
+
|
21
|
+
its(:seen) { should eql(6) }
|
22
|
+
|
23
|
+
describe 'the output set' do
|
24
|
+
subject(:output_set) { c14nizer.output_set }
|
25
|
+
|
26
|
+
its(:size) { should eql(3) }
|
27
|
+
|
28
|
+
describe 'the items' do
|
29
|
+
subject { output_set.map(&:to_s) }
|
30
|
+
|
31
|
+
it { should include('http://www.qhm.mod.uk/portsmouth/leisure/fuel') }
|
32
|
+
it { should include('http://www.qhm.mod.uk/portsmouth/leisure/lntm') }
|
33
|
+
it { should include('http://unistats.direct.gov.uk/searchresults.do') }
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
context 'options[:allow_query] is :all' do
|
39
|
+
subject(:c14nizer) { Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: :all) }
|
40
|
+
|
41
|
+
describe 'the output set' do
|
42
|
+
subject(:output_set) { c14nizer.output_set }
|
43
|
+
|
44
|
+
its(:size) { should eql(5) }
|
45
|
+
end
|
46
|
+
|
47
|
+
describe 'failures' do
|
48
|
+
subject(:failures) { c14nizer.failures }
|
49
|
+
|
50
|
+
it { should be_a(Hash) }
|
51
|
+
|
52
|
+
it 'has our last URL and an error' do
|
53
|
+
e = failures[test_urls.last]
|
54
|
+
e.should be_an(Addressable::URI::InvalidURIError)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe URI::QueryHash do
|
4
|
+
subject(:hash) { {}.extend URI::QueryHash }
|
5
|
+
|
6
|
+
its(:to_s) { should eql('') }
|
7
|
+
|
8
|
+
describe 'setting a value by symbol' do
|
9
|
+
before { hash['x'] = '1' }
|
10
|
+
|
11
|
+
its([:x]) { should eql('1') }
|
12
|
+
its(['x']) { should eql('1') }
|
13
|
+
its(:to_s) { should eql('x=1') }
|
14
|
+
end
|
15
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'optic14n'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe URI::QueryHash do
|
4
|
+
describe 'non-HTML encoded query strings' do
|
5
|
+
subject { { 'foo' => nil, 'bar' => nil }.extend URI::QueryHash }
|
6
|
+
|
7
|
+
its(['foo']) { should be_nil }
|
8
|
+
its(['bar']) { should be_nil }
|
9
|
+
its(:to_s) { should eql('foo&bar') }
|
10
|
+
end
|
11
|
+
end
|
metadata
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: optic14n
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Russell Garner
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-03-27 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: addressable
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.3'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: gem_publisher
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.3.0
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.3.0
|
62
|
+
description: Canonicalises URLs.
|
63
|
+
email:
|
64
|
+
- rgarner@zephyros-systems.co.uk
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- Gemfile
|
71
|
+
- LICENSE.txt
|
72
|
+
- README.md
|
73
|
+
- Rakefile
|
74
|
+
- jenkins.sh
|
75
|
+
- lib/optic14n.rb
|
76
|
+
- lib/optic14n/canonicalized_urls.rb
|
77
|
+
- lib/optic14n/version.rb
|
78
|
+
- lib/tasks/measure_reduction.rake
|
79
|
+
- lib/uri/bluri.rb
|
80
|
+
- lib/uri/query_hash.rb
|
81
|
+
- optic14n.gemspec
|
82
|
+
- spec/bluri_spec.rb
|
83
|
+
- spec/c14n.t
|
84
|
+
- spec/c14n_spec.rb
|
85
|
+
- spec/canonicalized_urls_spec.rb
|
86
|
+
- spec/query_hash_spec.rb
|
87
|
+
- spec/spec_helper.rb
|
88
|
+
- spec/uri/query_hash_spec.rb
|
89
|
+
homepage: ''
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
hash: 2602697415991458495
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
hash: 2602697415991458495
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: Specifically, HTTP URLs, for a limited purpose
|
120
|
+
test_files:
|
121
|
+
- spec/bluri_spec.rb
|
122
|
+
- spec/c14n.t
|
123
|
+
- spec/c14n_spec.rb
|
124
|
+
- spec/canonicalized_urls_spec.rb
|
125
|
+
- spec/query_hash_spec.rb
|
126
|
+
- spec/spec_helper.rb
|
127
|
+
- spec/uri/query_hash_spec.rb
|