rack-utf8_sanitizer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+
3
+ rvm:
4
+ - 1.9.2
5
+ - 1.9.3
6
+ - 2.0.0
7
+ - jruby-19mode
8
+ - rbx-19mode
9
+
10
+ script:
11
+ - rake spec
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rack-utf8_sanitizer.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Peter Zotov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,48 @@
1
+ # Rack::UTF8Sanitizer
2
+
3
+ Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF-8 characters in the request URI and headers.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'rack-utf8_sanitizer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install rack-utf8_sanitizer
18
+
19
+ For Rails, add this to your `application.rb`:
20
+
21
+ config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
22
+
23
+ For Rack apps, add this to `config.ru`:
24
+
25
+ use Rack::UTF8Sanitizer
26
+
27
+ ## Usage
28
+
29
+ Rack::UTF8Sanitizer divides all keys in the [Rack environment](http://rack.rubyforge.org/doc/SPEC.html) into two distinct groups: keys which contain raw data and keys which contain percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
30
+
31
+ The generic sanitization algorithm is as follows:
32
+
33
+ 1. Force the encoding to UTF-8.
34
+ 2. If the result contains invalid characters:
35
+ 1. Force the encoding to ASCII-8BIT.
36
+ 2. Re-encode it as UTF-8, replacing invalid and undefined characters with U+FFFD.
37
+
38
+ For fields with "raw data", the algorithm is applied once and the (UTF-8 encoded) result is left in the environment.
39
+
40
+ For fields with "percent-encoded data", the algorithm is applied twice to catch both invalid characters appearing as-is and invalid characters appearing in the percent encoding. The percent-encoded, US-ASCII encoded result is left in the environment.
41
+
42
+ ## Contributing
43
+
44
+ 1. Fork it
45
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
46
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
47
+ 4. Push to the branch (`git push origin my-new-feature`)
48
+ 5. Create new Pull Request
@@ -0,0 +1,8 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task :default => :spec
4
+
5
+ desc "Run tests"
6
+ task :spec do
7
+ sh 'bacon -a'
8
+ end
@@ -0,0 +1,78 @@
1
+ require 'uri'
2
+
3
module Rack
  # Rack middleware that replaces invalid UTF-8 byte sequences in the request
  # environment before the request reaches the wrapped application.
  #
  # Two groups of environment keys are processed:
  #
  # * the keys listed in URI_FIELDS hold percent-encoded data; their values
  #   are normalized and left percent-encoded, tagged as US-ASCII;
  # * every other key starting with "HTTP_" holds raw data; its value is
  #   sanitized once and left tagged as UTF-8.
  #
  # All other keys are left untouched.
  class UTF8Sanitizer
    # Environment keys whose values are percent-encoded.
    # http://rack.rubyforge.org/doc/SPEC.html
    URI_FIELDS = %w(
      SCRIPT_NAME
      REQUEST_PATH REQUEST_URI PATH_INFO
      QUERY_STRING
      HTTP_REFERER
    ).freeze

    # A single "unreserved" character as defined by RFC 3986, section 2.3.
    UNRESERVED = /\A[A-Za-z0-9\-._~]\z/

    # A well-formed percent escape; the two hex digits are captured.
    PERCENT_ESCAPE = /%([0-9A-Fa-f]{2})/

    # Bytes that have to be percent-encoded on output: a '%' which does not
    # start a well-formed escape, and anything that is neither a reserved nor
    # an unreserved URI character. A '%' that does start a well-formed escape
    # is left alone so that escapes kept by unescape_unreserved are not
    # encoded a second time.
    UNSAFE = /%(?![0-9A-Fa-f]{2})|[^\-_.!~*'()a-zA-Z0-9;\/?:@&=+$,\[\]%]/

    # app - the next Rack application in the middleware chain.
    def initialize(app)
      @app = app
    end

    # Sanitizes env in place and passes it on to the wrapped application.
    # Returns whatever the wrapped application returns.
    def call(env)
      @app.call(sanitize(env))
    end

    # Sanitizes the URI fields and the HTTP_* headers of env in place.
    #
    # Only values of already existing keys are reassigned, which is safe to do
    # while iterating over the hash. The frozen state of each original value is
    # carried over to its replacement.
    #
    # Returns env.
    def sanitize(env)
      env.each do |key, value|
        if URI_FIELDS.include?(key)
          # A non-String value cannot hold percent-encoded text; leave it be
          # instead of failing on it.
          next unless value.is_a?(String)

          env[key] = transfer_frozen(value, sanitize_uri(value))
        elsif key.to_s.start_with?('HTTP_')
          # Just sanitize the headers and leave them in UTF-8. There is
          # no reason to have UTF-8 in headers, but if it's valid, let it be.
          env[key] = transfer_frozen(value, sanitize_string(value))
        end
      end

      env
    end

    protected

    # Sanitizes a percent-encoded value.
    #
    # Invalid UTF-8 may be present both as raw bytes and in percent-encoded
    # form, so the value is sanitized, the escapes which may carry UTF-8 data
    # are decoded, the result is sanitized again and encoded back.
    #
    # Escapes of reserved characters (e.g. %2F, %26, %3D) are never decoded:
    # decoding them would change the structure of the URI, as nothing would
    # encode them back.
    #
    # Returns a new, pure-ASCII String tagged as US-ASCII.
    def sanitize_uri(value)
      raw     = sanitize_string(value).force_encoding('ASCII-8BIT')
      decoded = unescape_unreserved(raw)

      escape_unsafe(sanitize_string(decoded))
    end

    # Decodes the percent escapes of unreserved characters and of bytes
    # 0x80..0xFF (the bytes multibyte UTF-8 sequences consist of). Every other
    # escape is kept verbatim.
    #
    # input - a String tagged as ASCII-8BIT.
    #
    # Returns a new String tagged as ASCII-8BIT.
    def unescape_unreserved(input)
      input.gsub(PERCENT_ESCAPE) do |escaped|
        byte = Regexp.last_match(1).hex
        # pack('C') always yields a one-byte ASCII-8BIT string.
        char = [byte].pack('C')

        if byte >= 0x80 || char =~ UNRESERVED
          char
        else
          escaped
        end
      end
    end

    # Percent-encodes, byte by byte and with uppercase hex digits, everything
    # matched by UNSAFE.
    #
    # Returns a new String tagged as US-ASCII.
    def escape_unsafe(input)
      input.dup.force_encoding('ASCII-8BIT').
        gsub(UNSAFE) { |byte| format('%%%02X', byte.ord) }.
        force_encoding('US-ASCII')
    end

    # Returns a copy of input which is valid UTF-8. Non-String input is
    # returned as is.
    #
    # If the bytes of input are not valid UTF-8, the copy is reinterpreted as
    # ASCII-8BIT and transcoded to UTF-8; every byte which has no UTF-8
    # counterpart (that is, every byte above 0x7F) becomes U+FFFD.
    def sanitize_string(input)
      return input unless input.is_a?(String)

      copy = input.dup.force_encoding('UTF-8')
      return copy if copy.valid_encoding?

      copy.
        force_encoding('ASCII-8BIT').
        encode!('UTF-8',
                invalid: :replace,
                undef:   :replace)
    end

    # Freezes `to` if `from` is frozen. Returns `to`.
    def transfer_frozen(from, to)
      if from.frozen?
        to.freeze
      else
        to
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.name = "rack-utf8_sanitizer"
5
+ gem.version = '1.0.0'
6
+ gem.authors = ["Peter Zotov"]
7
+ gem.email = ["whitequark@whitequark.org"]
8
+ gem.description = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
9
+ %{invalid UTF8 characters in request URI and headers.}
10
+ gem.summary = gem.description
11
+ gem.homepage = "http://github.com/whitequark/rack-utf8_sanitizer"
12
+
13
+ gem.files = `git ls-files`.split($/)
14
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
15
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
16
+ gem.require_paths = ["lib"]
17
+
18
+ gem.required_ruby_version = '>= 1.9'
19
+
20
+ gem.add_dependency "rack", '~> 1.0'
21
+
22
+ gem.add_development_dependency "bacon"
23
+ gem.add_development_dependency "bacon-colored_output"
24
+ end
@@ -0,0 +1,138 @@
1
+ # encoding:ascii-8bit
2
+
3
+ require 'bacon/colored_output'
4
+ require 'rack/utf8_sanitizer'
5
+
6
+ describe Rack::UTF8Sanitizer do
7
+ before do
8
+ @app = Rack::UTF8Sanitizer.new(-> env { env })
9
+ end
10
+
11
+ shared :does_sanitize_plain do
12
+ it "sanitizes plaintext entity (HTTP_USER_AGENT)" do
13
+ env = @app.({ "HTTP_USER_AGENT" => @plain_input })
14
+ result = env["HTTP_USER_AGENT"]
15
+
16
+ result.encoding.should == Encoding::UTF_8
17
+ result.should.be.valid_encoding
18
+ end
19
+ end
20
+
21
+ shared :does_sanitize_uri do
22
+ it "sanitizes URI-like entity (REQUEST_PATH)" do
23
+ env = @app.({ "REQUEST_PATH" => @uri_input })
24
+ result = env["REQUEST_PATH"]
25
+
26
+ result.encoding.should == Encoding::US_ASCII
27
+ result.should.be.valid_encoding
28
+ end
29
+ end
30
+
31
+ describe "with invalid UTF-8 input" do
32
+ before do
33
+ @plain_input = "foo\xe0".force_encoding('UTF-8')
34
+ @uri_input = "foo%E0".force_encoding('UTF-8')
35
+ end
36
+
37
+ behaves_like :does_sanitize_plain
38
+ behaves_like :does_sanitize_uri
39
+ end
40
+
41
+ describe "with invalid, incorrectly percent-encoded UTF-8 URI input" do
42
+ before do
43
+ @uri_input = "foo%E0\xe0".force_encoding('UTF-8')
44
+ end
45
+
46
+ behaves_like :does_sanitize_uri
47
+ end
48
+
49
+ describe "with invalid ASCII-8BIT input" do
50
+ before do
51
+ @plain_input = "foo\xe0"
52
+ @uri_input = "foo%E0"
53
+ end
54
+
55
+ behaves_like :does_sanitize_plain
56
+ behaves_like :does_sanitize_uri
57
+ end
58
+
59
+ describe "with invalid, incorrectly percent-encoded ASCII-8BIT URI input" do
60
+ before do
61
+ @uri_input = "foo%E0\xe0"
62
+ end
63
+
64
+ behaves_like :does_sanitize_uri
65
+ end
66
+
67
+ shared :identity_plain do
68
+ it "does not change plaintext entity (HTTP_USER_AGENT)" do
69
+ env = @app.({ "HTTP_USER_AGENT" => @plain_input })
70
+ result = env["HTTP_USER_AGENT"]
71
+
72
+ result.encoding.should == Encoding::UTF_8
73
+ result.should.be.valid_encoding
74
+ result.should == @plain_input
75
+ end
76
+ end
77
+
78
+ shared :identity_uri do
79
+ it "does not change URI-like entity (REQUEST_PATH)" do
80
+ env = @app.({ "REQUEST_PATH" => @uri_input })
81
+ result = env["REQUEST_PATH"]
82
+
83
+ result.encoding.should == Encoding::US_ASCII
84
+ result.should.be.valid_encoding
85
+ result.should == @uri_input
86
+ end
87
+ end
88
+
89
+ describe "with valid UTF-8 input" do
90
+ before do
91
+ @plain_input = "foo bar лол".force_encoding('UTF-8')
92
+ @uri_input = "foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
93
+ end
94
+
95
+ behaves_like :identity_plain
96
+ behaves_like :identity_uri
97
+ end
98
+
99
+ describe "with valid, not percent-encoded UTF-8 URI input" do
100
+ before do
101
+ @uri_input = "foo+bar+лол".force_encoding('UTF-8')
102
+ end
103
+
104
+ it "does not change URI-like entity (REQUEST_PATH)" do
105
+ env = @app.({ "REQUEST_PATH" => @uri_input })
106
+ result = env["REQUEST_PATH"]
107
+
108
+ result.encoding.should == Encoding::US_ASCII
109
+ result.should.be.valid_encoding
110
+ result.should == URI.encode(@uri_input)
111
+ end
112
+ end
113
+
114
+ describe "with valid ASCII-8BIT input" do
115
+ before do
116
+ @plain_input = "bar baz"
117
+ @uri_input = "bar+baz"
118
+ end
119
+
120
+ behaves_like :identity_plain
121
+ behaves_like :identity_uri
122
+ end
123
+
124
+ describe "with frozen strings" do
125
+ before do
126
+ @plain_input = "bar baz".freeze
127
+ @uri_input = "bar+baz".freeze
128
+ end
129
+
130
+ it "preserves the frozen? status of input" do
131
+ env = @app.({ "HTTP_USER_AGENT" => @plain_input,
132
+ "REQUEST_PATH" => @uri_input })
133
+
134
+ env["HTTP_USER_AGENT"].should.be.frozen
135
+ env["REQUEST_PATH"].should.be.frozen
136
+ end
137
+ end
138
+ end
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rack-utf8_sanitizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Peter Zotov
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-05 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rack
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: bacon
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: bacon-colored_output
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8
63
+ characters in request URI and headers.
64
+ email:
65
+ - whitequark@whitequark.org
66
+ executables: []
67
+ extensions: []
68
+ extra_rdoc_files: []
69
+ files:
70
+ - .gitignore
71
+ - .travis.yml
72
+ - Gemfile
73
+ - LICENSE.txt
74
+ - README.md
75
+ - Rakefile
76
+ - lib/rack/utf8_sanitizer.rb
77
+ - rack-utf8_sanitizer.gemspec
78
+ - test/test_utf8_sanitizer.rb
79
+ homepage: http://github.com/whitequark/rack-utf8_sanitizer
80
+ licenses: []
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ none: false
87
+ requirements:
88
+ - - ! '>='
89
+ - !ruby/object:Gem::Version
90
+ version: '1.9'
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ! '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ requirements: []
98
+ rubyforge_project:
99
+ rubygems_version: 1.8.23
100
+ signing_key:
101
+ specification_version: 3
102
+ summary: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters
103
+ in request URI and headers.
104
+ test_files:
105
+ - test/test_utf8_sanitizer.rb
106
+ has_rdoc: