optic14n 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +60 -0
- data/Rakefile +19 -0
- data/jenkins.sh +10 -0
- data/lib/optic14n/canonicalized_urls.rb +45 -0
- data/lib/optic14n/version.rb +3 -0
- data/lib/optic14n.rb +10 -0
- data/lib/tasks/measure_reduction.rake +15 -0
- data/lib/uri/bluri.rb +116 -0
- data/lib/uri/query_hash.rb +33 -0
- data/optic14n.gemspec +25 -0
- data/spec/bluri_spec.rb +88 -0
- data/spec/c14n.t +71 -0
- data/spec/c14n_spec.rb +163 -0
- data/spec/canonicalized_urls_spec.rb +59 -0
- data/spec/query_hash_spec.rb +15 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/uri/query_hash_spec.rb +11 -0
- metadata +127 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Government Digital Service
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
# Optic14n
|
2
|
+
|
3
|
+
Canonicalises URLs.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'optic14n'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install optic14n
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
Parse a `BLURI` like this:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
bluri = BLURI('http://somewhere.com/?a=1&b=2&c=3')
|
25
|
+
```
|
26
|
+
|
27
|
+
Canonicalize it according to the [Previously-Established Rules](#the-previously-established-rules) thusly:
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
bluri.canonicalize!
|
31
|
+
```
|
32
|
+
|
33
|
+
You can also apply site-specific rules if you know that some of the query string is significant:
|
34
|
+
```ruby
|
35
|
+
bluri.canonicalize!(allow_query: :all)
|
36
|
+
```
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
bluri.canonicalize!(allow_query: [:a, :c])
|
40
|
+
# or
|
41
|
+
bluri.canonicalize!(allow_query: ['a', 'c'])
|
42
|
+
```
|
43
|
+
|
44
|
+
### The previously-established rules
|
45
|
+
|
46
|
+
This is a gem for canonicalising HTTP URIs such that we can boil our input set of URIs down to something that is much
|
47
|
+
smaller than it would otherwise be. We do this aggressively by:
|
48
|
+
|
49
|
+
* lowercasing URIs
|
50
|
+
* removing query strings (unless told otherwise)
|
51
|
+
* removing fragments
|
52
|
+
* escaping and unescaping various characters and escape sequences according to RFC3986
|
53
|
+
|
54
|
+
## Contributing
|
55
|
+
|
56
|
+
1. Fork it
|
57
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
58
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
59
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
60
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Put this project's own lib/ directory (the one next to this Rakefile) on the
# load path so that `require 'optic14n'` resolves to the working copy. The
# gemspec computes the same directory the same way.
$LOAD_PATH.unshift(File.expand_path('../lib', __FILE__))

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'
require 'optic14n'

# Pull in every task file shipped under lib/tasks.
Dir.glob('lib/tasks/*.rake').each { |rake_file| import rake_file }

require 'gem_publisher'

desc 'Publish gem to Rubygems'
task :publish_gem do
  # publish_if_updated's result is only printed when it is truthy.
  published = GemPublisher.publish_if_updated('optic14n.gemspec', :rubygems)
  puts "Published #{published}" if published
end

RSpec::Core::RakeTask.new(:spec)

# Both `rake` and `rake test` run the spec suite.
task default: :spec
task test: :spec
|
data/jenkins.sh
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Optic14n
  ##
  # Canonicalizes a set of URLs
  #
  # Each URL is parsed with +BLURI+ and canonicalized with the options given
  # at construction. Distinct results are collected in +output_set+; URLs
  # that raise are recorded in +failures+ (url => exception) instead of
  # aborting the run. +seen+ counts every URL processed, failed or not.
  class CanonicalizedUrls
    extend Forwardable

    # NOTE(review): nothing in this class assigns @each, so the +each+ reader
    # always returns nil. It is kept only so the public interface is unchanged.
    attr_reader :output_set, :seen, :failures, :each

    def_delegators :@output_set, :size

    ##
    # +urls+ is anything that responds to +each+ and yields URL strings;
    # +options+ is passed unchanged to +BLURI#canonicalize!+.
    def initialize(urls, options)
      @urls = urls
      @options = options
      # Start with empty results so the readers and #size are usable even
      # before #canonicalize! has been called.
      reset_results
    end

    ##
    # Canonicalize every URL, replacing any results from a previous run.
    def canonicalize!
      reset_results

      @urls.each do |url|
        begin
          @output_set.add(BLURI(url).canonicalize!(@options))
        rescue StandardError => e
          # Only ordinary errors are treated as per-URL failures. Interrupt,
          # SystemExit, NoMemoryError and friends must still propagate.
          @failures[url] = e
        end
        @seen += 1
      end
    end

    ##
    # Write the canonicalized URLs to +filename+, one per line.
    def write(filename)
      File.open(filename, 'w') do |file|
        @output_set.each do |url|
          file.puts url
        end
      end
    end

    ##
    # Canonicalize given urls. +options+ will be passed to +BLURI#canonicalize!+
    def self.from_urls(urls, options = {})
      CanonicalizedUrls.new(urls, options).tap { |c| c.canonicalize! }
    end

    private

    # Reset the counters and collections populated by #canonicalize!.
    def reset_results
      @seen = 0
      @failures = {}
      @output_set = Set.new
    end
  end
end
|
data/lib/optic14n.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
namespace :opt do
  desc 'Measure reduction from canonicalisation'
  # Usage: rake "opt:measure[urls.txt,canonical.txt]" - the second argument is
  # optional; when given, the canonicalized set is written to that file.
  task :measure, [:filename, :output_file] do |_, args|
    source_lines = File.read(args[:filename]).each_line
    destination  = args[:output_file]

    canonicalized = Optic14n::CanonicalizedUrls.from_urls(source_lines)
    canonicalized.write(destination) if destination

    puts "#{canonicalized.seen} urls seen, #{canonicalized.size} after canonicalisation"
  end
end
|
data/lib/uri/bluri.rb
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module URI
  ##
  # A URI class with a bit extra for canonicalising query strings
  #
  # The parsed URI is held as an Addressable::URI in @uri; scheme, path, host,
  # query, fragment and to_s are delegated to it. Build instances with
  # +BLURI.parse+, which tidies the input string first.
  class BLURI < URI::HTTP
    # Characters replaced by their (lower-case) percent-encoding in paths.
    PATH_ESCAPE_MAPPINGS = {
      '[' => '%5b',
      ']' => '%5d',
      ',' => '%2c',
      '"' => '%22',
      "'" => '%27',
      '|' => '%7c',
      '!' => '%21',
      '£' => '%c2%a3'
    }.freeze

    # Percent-encodings replaced by the literal character in paths. This runs
    # after escaping, so '!' always ends up literal.
    PATH_UNESCAPE_MAPPINGS = {
      '%7e' => '~',
      '%21' => '!'
    }.freeze

    # No longer needed to build the regexes below; retained because it is a
    # public constant.
    REQUIRE_REGEX_ESCAPE = (%w<. | ( ) [ ] { } + \ ^ $ * ?> & PATH_ESCAPE_MAPPINGS.keys).freeze

    extend Forwardable

    def_delegators :@uri, :scheme, :path, :host, :host=, :query, :fragment, :to_s

    ##
    # Raises URI::InvalidURIError when Addressable returns nothing for
    # +uri_str+ (for example when it is nil).
    def initialize(uri_str)
      @uri = ::Addressable::URI.parse(uri_str)
      raise URI::InvalidURIError, "'#{uri_str}' not a valid URI" unless @uri
    end

    ##
    # The query string parsed into a hash extended with QueryHash. Memoized
    # until the query is replaced through +query=+ or +query_hash=+.
    def query_hash
      @query_hash ||= CGI.parse(query || '').tap do |parsed|
        # By default, CGI.parse produces lots of arrays. Usually they have a single element
        # in them. That's correct but not terribly usable. Fix it here.
        parsed.each_pair { |key, values| parsed[key] = values[0] if values.length == 1 }
        parsed.extend QueryHash
      end
    end

    ##
    # Replace the query with the rendering of +value+. An empty rendering
    # clears the query entirely.
    def query_hash=(value)
      @query_hash = value
      rendered = @query_hash.to_s
      @uri.query = rendered == '' ? nil : rendered
    end

    ##
    # Replace the query string. An empty string clears it. The memoized
    # query hash is dropped so it is re-parsed on next use.
    def query=(query_str)
      @query_hash = nil
      @uri.query = query_str == '' ? nil : query_str
    end

    ##
    # Build a BLURI from +uri_str+. Strings are stripped, lower-cased and have
    # spaces percent-encoded first; other values are handed to +new+ as-is.
    def self.parse(uri_str)
      # Deal with known URI spec breaks - leading/trailing spaces and unencoded entities
      if uri_str.is_a? String
        uri_str = uri_str.strip.downcase.gsub(' ', '%20')
        uri_str.gsub!('&', '%26') if uri_str =~ /^mailto:.*&.*/
      end
      BLURI.new(uri_str)
    end

    ##
    # Truthy (the query string itself) for http/https URIs that have a query;
    # false or nil otherwise.
    def has_query?
      %w(http https).include?(@uri.scheme) && query
    end

    ##
    # Canonicalize in place and return self: https becomes http, trailing
    # slashes are dropped, selected path characters are escaped/unescaped,
    # the query is filtered and sorted, and the fragment is removed.
    #
    # +options[:allow_query]+ may be :all, a single key, or a list of keys
    # (strings or symbols). Anything else keeps no query parameters.
    def canonicalize!(options = {})
      @uri.scheme = 'http' if @uri.scheme == 'https'

      # Assign through the setter rather than mutating the string the getter
      # returned, so this does not depend on the getter exposing a mutable
      # internal buffer.
      @uri.path = canonical_path(@uri.path)

      canonicalize_query!(options)

      @uri.fragment = nil
      self
    end

    ##
    # Drop every query parameter not permitted by +options[:allow_query]+ and
    # sort what remains by key.
    def canonicalize_query!(options)
      allowed = options[:allow_query]

      unless allowed == :all
        # Compare as strings. Converting each incoming key with to_sym would
        # raise EncodingError for keys that are not valid in their encoding
        # and would intern arbitrary request data as symbols.
        permitted = [allowed].flatten.select { |key| key.is_a?(String) || key.is_a?(Symbol) }.map(&:to_s)
        query_hash.keep_if { |key, _| permitted.include?(key.to_s) }
      end

      self.query_hash = QueryHash[query_hash.sort_by { |key, _| key }]
    end

    ##
    # Generate a regex which matches all characters in PATH_ESCAPE_MAPPINGS
    def self.path_escape_char_regex
      @path_escape_char_regex ||= Regexp.union(PATH_ESCAPE_MAPPINGS.keys)
    end

    ##
    # Generate a regex which matches all escape sequences in PATH_UNESCAPE_MAPPINGS
    def self.path_unescape_code_regex
      @path_unescape_code_regex ||= Regexp.union(PATH_UNESCAPE_MAPPINGS.keys)
    end

    private

    # Return +path+ without trailing slashes and with the escape and unescape
    # mappings applied, in that order. The input string is not modified.
    def canonical_path(path)
      path.sub(%r{/+$}, '')
          .gsub(BLURI.path_escape_char_regex, PATH_ESCAPE_MAPPINGS)
          .gsub(BLURI.path_unescape_code_regex, PATH_UNESCAPE_MAPPINGS)
    end
  end
end
|
109
|
+
|
110
|
+
module Kernel
  # Everything defined below is available both as a private instance method
  # (so bare `BLURI(...)` works anywhere) and as `Kernel.BLURI(...)`.
  module_function

  # Shorthand for URI::BLURI.parse(uri_str).
  def BLURI(uri_str)
    ::URI::BLURI.parse(uri_str)
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module URI
  ##
  # Extends a hash with query string rendering/semi-indifferent access
  module QueryHash
    ##
    # Look up +key+, falling back to +key.to_s+ when the first lookup finds
    # nothing (nil or a zero-length value). An empty array - what a
    # valueless parameter looks like - is reported as nil.
    def [](key)
      item = super(key)
      item = super(key.to_s) if blank_item?(item)
      item.is_a?(Array) && item.empty? ? nil : item
    end

    ##
    # Render the hash as a query string, in the hash's own key order.
    def to_s
      keys.map { |key| render_value(key, self[key]) }.join('&')
    end

    ##
    # Creates a new hash populated with the given objects.
    def self.[](value)
      Hash[value].tap do |hash|
        hash.extend(QueryHash)
      end
    end

    private

    # True for nil and for anything that reports a length of zero. Values
    # without a length (e.g. Integers) are never blank; asking them for
    # their length would raise NoMethodError.
    def blank_item?(item)
      item.nil? || (item.respond_to?(:length) && item.length.zero?)
    end

    # Render one key. A nil value renders the bare key (not form-encoded);
    # an array renders one key=value pair per element.
    def render_value(key, value)
      case value
      when nil then key
      when Array then value.map { |el| render_value(key, el) }.join('&')
      else URI.encode_www_form_component(key) << '=' << URI.encode_www_form_component(value)
      end
    end
  end
end
|
data/optic14n.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'optic14n/version'
|
5
|
+
|
6
|
+
# Gem definition. The file list comes from git, so only tracked files are
# packaged; executables and test files are derived from that list by path.
Gem::Specification.new do |gem|
  gem.name        = 'optic14n'
  gem.version     = Optic14n::VERSION
  gem.license     = 'MIT'
  gem.homepage    = ''
  gem.authors     = ['Russell Garner']
  gem.email       = ['rgarner@zephyros-systems.co.uk']
  gem.summary     = 'Specifically, HTTP URLs, for a limited purpose'
  gem.description = 'Canonicalises URLs.'

  tracked_files     = `git ls-files`.split($/)
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime
  gem.add_dependency 'addressable', '~> 2.3'

  # Development only
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'gem_publisher', '~> 1.3.0'
end
|
data/spec/bluri_spec.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for the BLURI() constructor and the accessors BLURI exposes.
describe URI::BLURI do
  it 'should be an HTTP URI' do
    bluri = BLURI('http://some.where.com')
    bluri.should be_a URI::HTTP
  end

  it 'should not allow other schemes' do
    # The expectation used to sit inside a lambda that was never called, so
    # the example passed without checking anything. BLURI.parse performs no
    # scheme check (the mailto: examples below depend on that), so the
    # expectation is recorded as pending rather than silently skipped.
    pending 'BLURI.parse does not reject non-HTTP schemes'
    lambda { BLURI('ftp://foo') }.should raise_error(ArgumentError)
  end

  it 'should not allow nil' do
    lambda { BLURI(nil) }.should raise_error(URI::InvalidURIError)
  end

  it 'supports scheme' do
    BLURI('http://foo').scheme.should == 'http'
  end
  it 'supports host' do
    BLURI('http://foo').host.should == 'foo'
  end
  it 'supports path' do
    BLURI('http://foo/a/path').path.should == '/a/path'
  end
  it 'supports query' do
    BLURI('http://foo?to=you&you=foo').query.should == 'to=you&you=foo'
  end
  it 'supports fragment' do
    BLURI('http://foo#fragment').fragment.should == 'fragment'
  end
  it 'supports mailto:someone@somewhere' do
    BLURI('mailto:me@there.com').to_s.should == 'mailto:me@there.com'
  end
  it 'corrects unencoded ampersands in mailto' do # http://www.faqs.org/rfcs/rfc2368.html
    BLURI('mailto:fruit&veg.newcastle@rpa.gsi.gov.uk').to_s.should == 'mailto:fruit%26veg.newcastle@rpa.gsi.gov.uk'
  end
  it 'corrects trailing spaces' do
    BLURI('http://www.newspapersoc.org.uk ').to_s.should == 'http://www.newspapersoc.org.uk'
  end
  it 'corrects leading spaces' do
    BLURI(' http://www.newspapersoc.org.uk').to_s.should == 'http://www.newspapersoc.org.uk'
  end

  describe 'Query string parsing' do
    context 'the query string is of HTML-encoded form k=v&q=p' do
      before do
        @bluri = BLURI('http://some.com/a/path?itemid=1&type=RESOURCE')
      end

      it 'indexes the query string' do
        @bluri.query_hash['itemid'].should == '1'
      end

      it 'allows indexing by symbol' do
        @bluri.query_hash[:itemid].should == '1'
      end

      it 'shows nil for absent items' do
        @bluri.query_hash[:eerie_flash].should == nil
      end

      it 'indexes the second query string item' do
        # Lower-case because BLURI.parse downcases the whole URI string.
        @bluri.query_hash['type'].should == 'resource'
      end

      it 'allows setting of the query' do
        @bluri.query = 'furry=really'
        @bluri.to_s.should == 'http://some.com/a/path?furry=really'
      end
    end

    context 'the querystring is not an HTML-encoded thing' do
      before do
        @bluri = BLURI('http://some.com/a/path?foo&bar')
      end

      it 'retains the query string' do
        @bluri.query.should == 'foo&bar'
      end

      it 'has a query hash with empty elements' do
        # Check both keys; 'foo' used to be asserted twice.
        @bluri.query_hash['foo'].should == nil
        @bluri.query_hash['bar'].should == nil
      end
    end
  end
end
|
data/spec/c14n.t
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Here for reference, see original at
|
2
|
+
# https://github.com/alphagov/redirector/blob/master/tests/lib/c14n.t
|
3
|
+
|
4
|
+
use strict;
use Test::More;
require 'lib/c14n.pl';

#
#  case
#
is(c14n_url("http://www.EXAMPLE.COM/Foo/Bar/BAZ"), "http://www.example.com/foo/bar/baz", "c14n URL is lower-case");

#
#  protocol
#
is(c14n_url("https://www.example.com"), "http://www.example.com", "translates protocol to http");

#
#  slashes
#
is(c14n_url("http://www.example.com/"), "http://www.example.com", "drops trailing slash");
is(c14n_url("http://www.example.com////"), "http://www.example.com", "drops multiple trailing slashes");

#
#  fragment identifier
#
is(c14n_url("http://www.example.com#foo"), "http://www.example.com", "drops fragment identifier");
is(c14n_url("http://www.example.com/#foo"), "http://www.example.com", "drops fragment identifier and slashes");

#
#  encoding
#
is(c14n_url("http://www.example.com/:colon:"), "http://www.example.com/:colon:", "colons");
is(c14n_url("http://www.example.com/~tide"), "http://www.example.com/~tide", "tilde");
is(c14n_url("http://www.example.com/_underscore_"), "http://www.example.com/_underscore_", "underscore");
is(c14n_url("http://www.example.com/*asterisk*"), "http://www.example.com/*asterisk*", "asterisk");
is(c14n_url("http://www.example.com/(parens)"), "http://www.example.com/(parens)", "parens");
is(c14n_url("http://www.example.com/[square-brackets]"), "http://www.example.com/%5bsquare-brackets%5d", "square-brackets");

is(c14n_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27', "commas and quotes");
is(c14n_url("http://www.example.com/problematic-in-curl[]||[and-regexes]"), "http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d", "square brackets and pipes");
is(c14n_url("http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21"),
    'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!',
    "non-reserved character percent decoding");

is(c14n_url("https://www.example.com/pound-sign-£"), "http://www.example.com/pound-sign-%c2%a3", "pound sign");

#
#  query_strings
#
is(c14n_url("http://www.example.com?q=foo"), "http://www.example.com", "drops disallowed query-string");
is(c14n_url("http://www.example.com/?q=foo"), "http://www.example.com", "drops disallowed query-string after slash");
is(c14n_url("http://www.example.com/?q=foo#bar"), "http://www.example.com", "drops disallowed query-string after a slash with fragid");

is(c14n_url("http://www.example.com?a=1&c=3&b=2", '*'), "http://www.example.com?a=1&b=2&c=3", "query string wildcard value");

is(c14n_url("http://www.example.com/?q=foo", "q"), "http://www.example.com?q=foo", "allow named query_string parameter");

is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "sorts query_string values");
is(c14n_url("http://www.example.com?c=23&d=1&b=909&e=33&a=1", " b e,c:d, a "), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "accept colon and space separated allowed values");
is(c14n_url("http://www.example.com?c=23;d=1;b=909;e=33;a=1", "b,e,c,d,a"), "http://www.example.com?a=1&b=909&c=23&d=1&e=33", "converts matrix URI to query_string");

is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "topic,item"), "http://www.example.com?item=23444&topic=334499", "allows cherry-picked query_string");
is(c14n_url("http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444", "foo,bar,baz"), "http://www.example.com", "no ? for empty query_string values");

is(c14n_url("http://www.example.com?a=you're_dangerous", '*'), "http://www.example.com?a=you%27re_dangerous", "escape query string values");

#
#  normalise url
#
is(normalise_url("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'"), 'http://www.example.com/commas%2cand-%22quotes%22-make-CSV-harder-to-%27awk%27', "commas and quotes");

# No plan is declared above, so Test::More needs to be told explicitly that
# the run has finished; without this the script is reported as a failure.
done_testing();
|
data/spec/c14n_spec.rb
ADDED
@@ -0,0 +1,163 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
# Ruby translation of the Perl canonicalisation tests kept in spec/c14n.t.
describe "Paul's tests, translated from Perl" do
  it 'lowercases URLs' do
    BLURI('http://www.EXAMPLE.COM/Foo/Bar/BAZ').canonicalize!.to_s.should == 'http://www.example.com/foo/bar/baz'
  end

  describe 'protocol' do
    it 'translates protocol to http', reason: 'Reduces our input space, everything public anyway' do
      BLURI('https://www.example.com').canonicalize!.to_s.should == 'http://www.example.com'
    end
  end

  describe 'slashes' do
    it 'drops single trailing slashes' do
      BLURI('http://www.example.com/').canonicalize!.to_s.should == 'http://www.example.com'
    end

    it 'drops multiple trailing slashes' do
      BLURI('http://www.example.com////').canonicalize!.to_s.should == 'http://www.example.com'
    end

    it 'drops multiple trailing slashes on the path' do
      BLURI('http://www.example.com/foo///').canonicalize!.to_s.should == 'http://www.example.com/foo'
    end
  end

  describe 'fragments' do
    # Double-quoted so the apostrophe survives: 'won''t' is two adjacent
    # literals and evaluates to "wont".
    it 'drops fragment identifier', reason: "They won't be mapped, so are redundant" do
      BLURI('http://www.example.com#foo').canonicalize!.to_s.should == 'http://www.example.com'
    end
    it 'drops fragment identifier and slashes' do
      BLURI('http://www.example.com/#foo').canonicalize!.to_s.should == 'http://www.example.com'
    end
  end

  describe 'Things to keep verbatim or encode', reason: 'http://tools.ietf.org/html/rfc3986' do
    it 'retains colons' do
      BLURI('http://www.example.com/:colon:').canonicalize!.to_s.should == 'http://www.example.com/:colon:'
    end
    it 'retains tilde' do
      BLURI('http://www.example.com/~tilde').canonicalize!.to_s.should == 'http://www.example.com/~tilde'
    end
    it 'retains underscores' do
      BLURI('http://www.example.com/_underscore_').canonicalize!.to_s.should == 'http://www.example.com/_underscore_'
    end
    it 'retains asterisks' do
      BLURI('http://www.example.com/*asterisk*').canonicalize!.to_s.should == 'http://www.example.com/*asterisk*'
    end
    it 'retains parens' do
      BLURI('http://www.example.com/(parens)').canonicalize!.to_s.should == 'http://www.example.com/(parens)'
    end
    it 'escapes square brackets' do
      BLURI('http://www.example.com/[square-brackets]').canonicalize!.to_s.should == 'http://www.example.com/%5bsquare-brackets%5d'
    end
    it 'encodes commas and quotes', reason: 'They make csv harder to awk' do
      BLURI("http://www.example.com/commas,and-\"quotes\"-make-CSV-harder-to-'awk'").canonicalize!.to_s.should ==
        'http://www.example.com/commas%2cand-%22quotes%22-make-csv-harder-to-%27awk%27'
    end
    it 'encodes square brackets and pipes', reason: "It's problematic in curl and regexes" do
      BLURI('http://www.example.com/problematic-in-curl[]||[and-regexes]').canonicalize!.to_s.should ==
        'http://www.example.com/problematic-in-curl%5b%5d%7c%7c%5band-regexes%5d'
    end
    it 'decodes non-reserved characters (! and ~)' do
      # My god, it's full of stars
      BLURI('http://www.example.com/%7eyes%20I%20have%20now%20read%20%5brfc%203986%5d%2C%20%26%20I%27m%20a%20%3Dlot%3D%20more%20reassured%21%21').
        canonicalize!.to_s.should == 'http://www.example.com/~yes%20i%20have%20now%20read%20%5brfc%203986%5d%2c%20%26%20i%27m%20a%20%3dlot%3d%20more%20reassured!!'
    end
    it 'encodes pound signs' do
      BLURI('https://www.example.com/pound-sign-£').canonicalize!.to_s.should == 'http://www.example.com/pound-sign-%c2%a3'
    end
  end

  describe 'query strings' do
    it 'disallows all query string params by default' do
      BLURI('http://www.example.com?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
    end
    # Double-quoted for the same reason as above ('there''s' => "theres").
    it "disallows all params when there's a slash" do
      BLURI('http://www.example.com/?q=foo').canonicalize!.to_s.should == 'http://www.example.com'
    end
    it 'disallows all params after a slash with fragid' do
      BLURI('http://www.example.com/?q=foo#bar').canonicalize!.to_s.should == 'http://www.example.com'
    end

    describe 'allowing some or all query string values' do
      it 'allows named query_string parameters' do
        BLURI('http://www.example.com/?q=foo&r=bar').canonicalize!(allow_query: 'q').to_s.should ==
          'http://www.example.com?q=foo'
      end
      it 'sorts query string values' do
        BLURI('http://www.example.com?c=23&d=1&b=909&e=33&a=1').
          canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
      end
      it 'encodes querystring values' do
        BLURI("http://www.example.com?a=you're_dangerous").canonicalize!(allow_query: :all).to_s.should ==
          'http://www.example.com?a=you%27re_dangerous'
      end
      it 'whitelists and sorts query strings' do
        BLURI('http://www.example.com?a=1&c=3&b=2').canonicalize!(allow_query: :all).to_s.should ==
          'http://www.example.com?a=1&b=2&c=3'
      end
      it 'converts matrix URI to query_string' do
        BLURI('http://www.example.com?c=23;d=1;b=909;e=33;a=1').
          canonicalize!(allow_query: [:b,:e,:c,:d,:a]).to_s.should == 'http://www.example.com?a=1&b=909&c=23&d=1&e=33'
      end
      it 'sorts cherry-picked query string arguments' do
        BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
          canonicalize!(allow_query: [:topic, :item]).to_s.should == 'http://www.example.com?item=23444&topic=334499'
      end
      it 'ignores empty querystring values' do
        BLURI('http://www.example.com?a=2322sdfsf&topic=334499&q=909&item=23444').
          canonicalize!(allow_query: %w(foo bar baz)).to_s.should == 'http://www.example.com'
      end

      describe 'querystrings that are not an HTML-encoded thing' do
        before do
          @bluri = BLURI('http://some.com/a/path?foo&bar').canonicalize!(allow_query: :all)
        end

        it 'retains the query string' do
          @bluri.query.should == 'bar&foo'
        end

        it 'has a query hash with empty elements' do
          @bluri.query_hash['foo'].should == nil
          @bluri.query_hash['bar'].should == nil
        end

        it 'renders the string properly' do
          @bluri.query_hash.to_s.should == 'bar&foo'
        end
      end
    end

    describe 'degenerate cases' do
      describe 'the treatment of query strings when there are query string octets that unescape to '\
               'invalid UTF-8 sequences (we no longer treat these as failures)' do
        it 'no longer raises exceptions when there are bad things in query values' do
          BLURI('http://example.com/path?view=%ED').
            canonicalize!(allow_query: :all).
            to_s.should eql('http://example.com/path?view=%ED')
        end

        it 're-encodes correctly when there are bad things in query keys' do
          BLURI('http://example.com/path?%ED=view').
            canonicalize!(allow_query: :all).
            to_s.should eql('http://example.com/path?%ED=view')
        end
      end

      describe 'failure to canonicalize paths correctly' do
        # see https://www.pivotaltracker.com/s/projects/860575/stories/54502932

        subject { BLURI('http://www.voa.gov.uk/stuff/?query=thing').canonicalize!(allow_query: :all) }

        its(:path) { should eql('/stuff') }
        its(:query) { should eql('query=thing') }
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Exercises CanonicalizedUrls.from_urls over a small fixture that contains
# duplicates once query strings are removed, plus one unparseable entry.
describe Optic14n::CanonicalizedUrls do
  describe 'c14nize' do
    let(:test_urls) do
      [
        'http://www.qhm.mod.uk/portsmouth/leisure/fuel',
        'http://www.qhm.mod.uk/portsmouth/leisure/lntm?',
        'http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view',
        'http://www.qhm.mod.uk/portsmouth/leisure/lntm?action=view&id=199',
        'http://unistats.direct.gov.uk/searchResults.do?pname=institutesearchresults&level3Subjects=L3.90%AC10007761%ACFIRSTDEGREE%ACFulltime%AC430%ACNo%AC60%ACYes%AC83%ACNo%ACYes',
        '1234://123'
      ]
    end

    context 'options[:allow_query] is false' do
      subject(:c14nizer) do
        Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: false)
      end

      it { should be_a(Optic14n::CanonicalizedUrls) }

      # Every input is counted, including the one that fails to parse.
      its(:seen) { should eql(6) }

      describe 'the output set' do
        subject(:output_set) { c14nizer.output_set }

        its(:size) { should eql(3) }

        describe 'the items' do
          subject { output_set.map(&:to_s) }

          it { should include('http://www.qhm.mod.uk/portsmouth/leisure/fuel') }
          it { should include('http://www.qhm.mod.uk/portsmouth/leisure/lntm') }
          it { should include('http://unistats.direct.gov.uk/searchresults.do') }
        end
      end
    end

    context 'options[:allow_query] is :all' do
      subject(:c14nizer) do
        Optic14n::CanonicalizedUrls.from_urls(test_urls, allow_query: :all)
      end

      describe 'the output set' do
        subject(:output_set) { c14nizer.output_set }

        its(:size) { should eql(5) }
      end

      describe 'failures' do
        subject(:failures) { c14nizer.failures }

        it { should be_a(Hash) }

        it 'has our last URL and an error' do
          error = failures[test_urls.last]
          error.should be_an(Addressable::URI::InvalidURIError)
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Basic QueryHash behaviour on a plain hash extended with the module.
describe URI::QueryHash do
  subject(:hash) { {}.extend URI::QueryHash }

  its(:to_s) { should eql('') }

  # The value is stored under a string key; the examples then read it back
  # by symbol and by string.
  describe 'setting a value by string key' do
    before { hash['x'] = '1' }

    its([:x])  { should eql('1') }
    its(['x']) { should eql('1') }
    its(:to_s) { should eql('x=1') }
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'optic14n'
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Valueless parameters (e.g. "?foo&bar") are represented by nil values.
describe URI::QueryHash do
  describe 'non-HTML encoded query strings' do
    subject do
      valueless = { 'foo' => nil, 'bar' => nil }
      valueless.extend URI::QueryHash
    end

    its(['foo']) { should be_nil }
    its(['bar']) { should be_nil }

    # Keys are rendered bare, in insertion order.
    its(:to_s) { should eql('foo&bar') }
  end
end
|
metadata
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: optic14n
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Russell Garner
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-03-27 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: addressable
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '2.3'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: gem_publisher
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.3.0
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.3.0
|
62
|
+
description: Canonicalises URLs.
|
63
|
+
email:
|
64
|
+
- rgarner@zephyros-systems.co.uk
|
65
|
+
executables: []
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- .gitignore
|
70
|
+
- Gemfile
|
71
|
+
- LICENSE.txt
|
72
|
+
- README.md
|
73
|
+
- Rakefile
|
74
|
+
- jenkins.sh
|
75
|
+
- lib/optic14n.rb
|
76
|
+
- lib/optic14n/canonicalized_urls.rb
|
77
|
+
- lib/optic14n/version.rb
|
78
|
+
- lib/tasks/measure_reduction.rake
|
79
|
+
- lib/uri/bluri.rb
|
80
|
+
- lib/uri/query_hash.rb
|
81
|
+
- optic14n.gemspec
|
82
|
+
- spec/bluri_spec.rb
|
83
|
+
- spec/c14n.t
|
84
|
+
- spec/c14n_spec.rb
|
85
|
+
- spec/canonicalized_urls_spec.rb
|
86
|
+
- spec/query_hash_spec.rb
|
87
|
+
- spec/spec_helper.rb
|
88
|
+
- spec/uri/query_hash_spec.rb
|
89
|
+
homepage: ''
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
post_install_message:
|
93
|
+
rdoc_options: []
|
94
|
+
require_paths:
|
95
|
+
- lib
|
96
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
hash: 2602697415991458495
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
none: false
|
107
|
+
requirements:
|
108
|
+
- - ! '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
segments:
|
112
|
+
- 0
|
113
|
+
hash: 2602697415991458495
|
114
|
+
requirements: []
|
115
|
+
rubyforge_project:
|
116
|
+
rubygems_version: 1.8.23
|
117
|
+
signing_key:
|
118
|
+
specification_version: 3
|
119
|
+
summary: Specifically, HTTP URLs, for a limited purpose
|
120
|
+
test_files:
|
121
|
+
- spec/bluri_spec.rb
|
122
|
+
- spec/c14n.t
|
123
|
+
- spec/c14n_spec.rb
|
124
|
+
- spec/canonicalized_urls_spec.rb
|
125
|
+
- spec/query_hash_spec.rb
|
126
|
+
- spec/spec_helper.rb
|
127
|
+
- spec/uri/query_hash_spec.rb
|