rack-utf8_sanitizer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.travis.yml +11 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +48 -0
- data/Rakefile +8 -0
- data/lib/rack/utf8_sanitizer.rb +78 -0
- data/rack-utf8_sanitizer.gemspec +24 -0
- data/test/test_utf8_sanitizer.rb +138 -0
- metadata +106 -0
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Peter Zotov
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Rack::UTF8Sanitizer
|
2
|
+
|
3
|
+
Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters in request URI and headers.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'rack-utf8_sanitizer'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install rack-utf8_sanitizer
|
18
|
+
|
19
|
+
For Rails, add this to your `application.rb`:
|
20
|
+
|
21
|
+
config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
|
22
|
+
|
23
|
+
For Rack apps, add this to `config.ru`:
|
24
|
+
|
25
|
+
use Rack::UTF8Sanitizer
|
26
|
+
|
27
|
+
## Usage
|
28
|
+
|
29
|
+
Rack::UTF8Sanitizer divides all keys in the [Rack environment](http://rack.rubyforge.org/doc/SPEC.html) into two distinct groups: keys which contain raw data and the ones with percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
|
30
|
+
|
31
|
+
The generic sanitization algorithm is as follows:
|
32
|
+
|
33
|
+
1. Force the encoding to UTF-8.
|
34
|
+
2. If the result contains invalid characters:
|
35
|
+
1. Force the encoding to ASCII-8BIT.
|
36
|
+
2. Re-encode it as UTF-8, replacing invalid and undefined characters with U+FFFD.
|
37
|
+
|
38
|
+
For fields with "raw data", the algorithm is applied once and the (UTF-8 encoded) result is left in the environment.
|
39
|
+
|
40
|
+
For fields with "percent-encoded data", the algorithm is applied twice to catch both invalid characters appearing as-is and invalid characters appearing in the percent encoding. The percent-encoded result, tagged as US-ASCII, is left in the environment.
|
41
|
+
|
42
|
+
## Contributing
|
43
|
+
|
44
|
+
1. Fork it
|
45
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
46
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
47
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
48
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
module Rack
  # Rack middleware which makes sure that the URI-like fields and the HTTP_*
  # headers of the request environment never contain invalid UTF-8.
  #
  # Values of the fields listed in URI_FIELDS are treated as percent-encoded
  # data and are left percent-encoded (tagged as US-ASCII). Values of all
  # other HTTP_* keys are treated as raw data and are left in UTF-8.
  # Every other key of the environment is not touched.
  class UTF8Sanitizer
    # app - the next Rack application in the middleware chain.
    def initialize(app)
      @app = app
    end

    # Sanitizes +env+ in place and passes it on to the wrapped application.
    # Returns whatever the wrapped application returns.
    def call(env)
      @app.call(sanitize(env))
    end

    # http://rack.rubyforge.org/doc/SPEC.html
    URI_FIELDS = %w(
      SCRIPT_NAME
      REQUEST_PATH REQUEST_URI PATH_INFO
      QUERY_STRING
      HTTP_REFERER
    ).freeze

    # "unreserved" characters of RFC 3986, section 2.3. Percent-escapes of
    # these characters may be decoded without changing the meaning of a URI.
    UNRESERVED = /[A-Za-z0-9\-._~]/

    # Everything which has to be percent-escaped when a value is written
    # back. '%' is deliberately treated as safe: by the time this pattern is
    # applied, every '%' in the string starts a well-formed escape.
    UNSAFE = /[^\-_.!~*'()a-zA-Z0-9;\/?:@&=+$,\[\]%]/

    # Sanitizes the String values of +env+ in place and returns +env+.
    # Values which are not Strings are left alone.
    def sanitize(env)
      env.each do |key, value|
        next unless value.is_a?(String)

        if URI_FIELDS.include?(key)
          # There could be invalid UTF-8 characters both in raw and in
          # percent-encoded form.
          #
          # So, first sanitize the raw value, then decode those escapes
          # which can hide a part of a UTF-8 sequence, then sanitize the
          # result and escape it back.
          #
          # Escapes of reserved characters (e.g. %26, %2B, %2F) are never
          # decoded: turning them into literal '&', '+' or '/' would change
          # the meaning of the path or the query string.
          decoded_value = unescape_unreserved(
                            sanitize_string(value).
                              force_encoding('ASCII-8BIT'))

          env[key] = transfer_frozen(value,
                        escape_unsafe(sanitize_string(decoded_value)))

        elsif key.to_s.start_with?('HTTP_')
          # Just sanitize the headers and leave them in UTF-8. There is
          # no reason to have UTF-8 in headers, but if it's valid, let it be.
          env[key] = transfer_frozen(value,
                        sanitize_string(value))
        end
      end
    end

    protected

    # Returns a copy of +input+ which is tagged as UTF-8 and is valid UTF-8.
    # Anything which is not a String is returned as is.
    #
    # If the bytes of +input+ are not valid UTF-8, the whole string is
    # transcoded from ASCII-8BIT, which replaces every byte above 0x7F
    # with U+FFFD.
    def sanitize_string(input)
      return input unless input.is_a?(String)

      input = input.dup.force_encoding('UTF-8')
      return input if input.valid_encoding?

      input.
        force_encoding('ASCII-8BIT').
        encode!('UTF-8',
                invalid: :replace,
                undef:   :replace)
    end

    # Returns +to+, frozen if (and only if) +from+ is frozen.
    def transfer_frozen(from, to)
      if from.frozen?
        to.freeze
      else
        to
      end
    end

    # Decodes the percent-escapes of +input+ (a binary string) which stand
    # for unreserved characters or for bytes above 0x7F. All other
    # well-formed escapes are kept verbatim.
    #
    # A '%' which does not start a well-formed escape is rewritten to '%25',
    # so that it cannot combine with the characters which follow it into an
    # escape which was not present in the input.
    def unescape_unreserved(input)
      input.gsub(/%([0-9A-Fa-f]{2})?/) do |escape|
        hex = Regexp.last_match(1)

        if hex.nil?
          '%25'
        else
          code = hex.hex

          if code >= 0x80 || code.chr =~ UNRESERVED
            code.chr
          else
            escape
          end
        end
      end
    end

    # Percent-escapes, byte by byte, every character of +input+ which
    # matches UNSAFE. Returns a new string tagged as US-ASCII.
    def escape_unsafe(input)
      escaped = input.gsub(UNSAFE) do |char|
        char.each_byte.map { |byte| '%%%02X' % byte }.join
      end

      escaped.force_encoding('US-ASCII')
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-

# Gem specification for rack-utf8_sanitizer.
Gem::Specification.new do |gem|
  gem.name          = "rack-utf8_sanitizer"
  gem.version       = '1.0.0'
  gem.authors       = ["Peter Zotov"]
  gem.email         = ["whitequark@whitequark.org"]
  gem.description   = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
                      %{invalid UTF8 characters in request URI and headers.}
  gem.summary       = gem.description
  gem.homepage      = "http://github.com/whitequark/rack-utf8_sanitizer"
  # The package ships an MIT LICENSE.txt; declare it so that the license
  # shows up in the gem metadata as well.
  gem.license       = 'MIT'

  # The packaged files are whatever git tracks, so the gem has to be built
  # from a git checkout.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.required_ruby_version = '>= 1.9'

  gem.add_dependency             "rack", '~> 1.0'

  gem.add_development_dependency "bacon"
  gem.add_development_dependency "bacon-colored_output"
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
# encoding:ascii-8bit
|
2
|
+
|
3
|
+
require 'bacon/colored_output'
|
4
|
+
require 'rack/utf8_sanitizer'
|
5
|
+
|
6
|
+
# Specification of Rack::UTF8Sanitizer.
#
# The wrapped application simply returns the environment it was called
# with, so each example can inspect the sanitized values directly.
describe Rack::UTF8Sanitizer do
  before do
    @app = Rack::UTF8Sanitizer.new(-> env { env })
  end

  # Raw fields must come out as valid UTF-8.
  shared :does_sanitize_plain do
    it "sanitizes plaintext entity (HTTP_USER_AGENT)" do
      env    = @app.({ "HTTP_USER_AGENT" => @plain_input })
      result = env["HTTP_USER_AGENT"]

      result.encoding.should == Encoding::UTF_8
      result.should.be.valid_encoding
    end
  end

  # Percent-encoded fields must come out as valid US-ASCII.
  shared :does_sanitize_uri do
    it "sanitizes URI-like entity (REQUEST_PATH)" do
      env    = @app.({ "REQUEST_PATH" => @uri_input })
      result = env["REQUEST_PATH"]

      result.encoding.should == Encoding::US_ASCII
      result.should.be.valid_encoding
    end
  end

  describe "with invalid UTF-8 input" do
    before do
      @plain_input = "foo\xe0".force_encoding('UTF-8')
      @uri_input   = "foo%E0".force_encoding('UTF-8')
    end

    behaves_like :does_sanitize_plain
    behaves_like :does_sanitize_uri
  end

  describe "with invalid, incorrectly percent-encoded UTF-8 URI input" do
    before do
      @uri_input   = "foo%E0\xe0".force_encoding('UTF-8')
    end

    behaves_like :does_sanitize_uri
  end

  describe "with invalid ASCII-8BIT input" do
    before do
      @plain_input = "foo\xe0"
      @uri_input   = "foo%E0"
    end

    behaves_like :does_sanitize_plain
    behaves_like :does_sanitize_uri
  end

  describe "with invalid, incorrectly percent-encoded ASCII-8BIT URI input" do
    before do
      @uri_input   = "foo%E0\xe0"
    end

    behaves_like :does_sanitize_uri
  end

  # Valid raw input must pass through with the same content.
  shared :identity_plain do
    it "does not change plaintext entity (HTTP_USER_AGENT)" do
      env    = @app.({ "HTTP_USER_AGENT" => @plain_input })
      result = env["HTTP_USER_AGENT"]

      result.encoding.should == Encoding::UTF_8
      result.should.be.valid_encoding
      result.should == @plain_input
    end
  end

  # Valid percent-encoded input must pass through with the same content.
  shared :identity_uri do
    it "does not change URI-like entity (REQUEST_PATH)" do
      env    = @app.({ "REQUEST_PATH" => @uri_input })
      result = env["REQUEST_PATH"]

      result.encoding.should == Encoding::US_ASCII
      result.should.be.valid_encoding
      result.should == @uri_input
    end
  end

  describe "with valid UTF-8 input" do
    before do
      @plain_input = "foo bar лол".force_encoding('UTF-8')
      @uri_input   = "foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
    end

    behaves_like :identity_plain
    behaves_like :identity_uri
  end

  describe "with valid, not percent-encoded UTF-8 URI input" do
    before do
      @uri_input   = "foo+bar+лол".force_encoding('UTF-8')
    end

    it "percent-encodes URI-like entity (REQUEST_PATH)" do
      env    = @app.({ "REQUEST_PATH" => @uri_input })
      result = env["REQUEST_PATH"]

      result.encoding.should == Encoding::US_ASCII
      result.should.be.valid_encoding
      # The expected value is spelled out instead of being computed with the
      # escaping routine under test.
      result.should == "foo+bar+%D0%BB%D0%BE%D0%BB"
    end
  end

  describe "with valid ASCII-8BIT input" do
    before do
      @plain_input = "bar baz"
      @uri_input   = "bar+baz"
    end

    behaves_like :identity_plain
    behaves_like :identity_uri
  end

  describe "with frozen strings" do
    before do
      @plain_input = "bar baz".freeze
      @uri_input   = "bar+baz".freeze
    end

    it "preserves the frozen? status of input" do
      env  = @app.({ "HTTP_USER_AGENT" => @plain_input,
                     "REQUEST_PATH"    => @uri_input })

      env["HTTP_USER_AGENT"].should.be.frozen
      env["REQUEST_PATH"].should.be.frozen
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rack-utf8_sanitizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Peter Zotov
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-05 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rack
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: bacon
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: bacon-colored_output
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8
|
63
|
+
characters in request URI and headers.
|
64
|
+
email:
|
65
|
+
- whitequark@whitequark.org
|
66
|
+
executables: []
|
67
|
+
extensions: []
|
68
|
+
extra_rdoc_files: []
|
69
|
+
files:
|
70
|
+
- .gitignore
|
71
|
+
- .travis.yml
|
72
|
+
- Gemfile
|
73
|
+
- LICENSE.txt
|
74
|
+
- README.md
|
75
|
+
- Rakefile
|
76
|
+
- lib/rack/utf8_sanitizer.rb
|
77
|
+
- rack-utf8_sanitizer.gemspec
|
78
|
+
- test/test_utf8_sanitizer.rb
|
79
|
+
homepage: http://github.com/whitequark/rack-utf8_sanitizer
|
80
|
+
licenses: []
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
none: false
|
87
|
+
requirements:
|
88
|
+
- - ! '>='
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '1.9'
|
91
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
92
|
+
none: false
|
93
|
+
requirements:
|
94
|
+
- - ! '>='
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
requirements: []
|
98
|
+
rubyforge_project:
|
99
|
+
rubygems_version: 1.8.23
|
100
|
+
signing_key:
|
101
|
+
specification_version: 3
|
102
|
+
summary: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters
|
103
|
+
in request URI and headers.
|
104
|
+
test_files:
|
105
|
+
- test/test_utf8_sanitizer.rb
|
106
|
+
has_rdoc:
|