rack-utf8_sanitizer 1.1.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: be92627f205eac2e80ab554ad5add8ca2a084026
4
+ data.tar.gz: 5325281ed65e1e17d74cabbb75df877e0387e2e8
5
+ SHA512:
6
+ metadata.gz: a3cc3842501b3c7e7bd4da92ae81432a6b6d014ab4af8a828a4269d781554fdb774e6c7fe3dd19ea582d959ea8d3093abc1984882587aede7084bf6e3381c10c
7
+ data.tar.gz: 108c557d20ce9c4716ae5d89fcd2878b8eb6ef3cd8a359d3e240a2cef575ef9a69a9cd129f8ab00bbde5628badb5f4d7f3f374badcaa04693162fa2f40531ac0
@@ -4,12 +4,10 @@ rvm:
4
4
  - 1.9.2
5
5
  - 1.9.3
6
6
  - 2.0.0
7
- - jruby-19mode
8
- - rbx-19mode
7
+ # Use 2.1 rather than 2.1.0 until https://github.com/travis-ci/travis-ci/issues/2220 is fixed.
8
+ - 2.1
9
+ - jruby
10
+ - rbx-2
9
11
 
10
12
  script:
11
13
  - rake spec
12
-
13
- matrix:
14
- allow_failures:
15
- - rvm: jruby-19mode
data/README.md CHANGED
@@ -19,7 +19,7 @@ Or install it yourself as:
19
19
  For Rails, add this to your `application.rb`:
20
20
 
21
21
  ``` ruby
22
- config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
22
+ config.middleware.insert_before "Rack::Runtime", Rack::UTF8Sanitizer
23
23
  ```
24
24
 
25
25
  For Rack apps, add this to `config.ru`:
@@ -30,7 +30,7 @@ use Rack::UTF8Sanitizer
30
30
 
31
31
  ## Usage
32
32
 
33
- Rack::UTF8Sanitizer divides all keys in the [Rack environment](rack.rubyforge.org/doc/SPEC.html) in two distinct groups: keys which contain raw data and the ones with percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
33
+ Rack::UTF8Sanitizer divides all keys in the [Rack environment](http://rack.rubyforge.org/doc/SPEC.html) into two distinct groups: keys which contain raw data and keys which contain percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
34
34
 
35
35
  The generic sanitization algorithm is as follows:
36
36
 
@@ -1,9 +1,12 @@
1
1
  # encoding: ascii-8bit
2
2
 
3
3
  require 'uri'
4
+ require 'stringio'
4
5
 
5
6
  module Rack
6
7
  class UTF8Sanitizer
8
+ StringIO = ::StringIO
9
+
7
10
  def initialize(app)
8
11
  @app = app
9
12
  end
@@ -20,30 +23,27 @@ module Rack
20
23
  HTTP_REFERER
21
24
  )
22
25
 
26
+ SANITIZABLE_CONTENT_TYPES = %w(
27
+ text/plain
28
+ application/x-www-form-urlencoded
29
+ )
30
+
31
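+ # MRI optimization: request-method strings compared against in #sanitize
+ # (presumably kept as constants to avoid allocating a new String per request).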
32
+ POST = 'POST'
33
+ PUT = 'PUT'
34
+
23
35
  def sanitize(env)
36
+ request_method = env['REQUEST_METHOD']
37
+ if request_method == POST || request_method == PUT
38
+ sanitize_rack_input(env)
39
+ end
24
40
  env.each do |key, value|
25
41
  if URI_FIELDS.include?(key)
26
- # URI.encode/decode expect the input to be in ASCII-8BIT.
27
- # However, there could be invalid UTF-8 characters both in
28
- # raw and percent-encoded form.
29
- #
30
- # So, first sanitize the value, then percent-decode it while
31
- # treating as UTF-8, then sanitize the result and encode it back.
32
- #
33
- # The result is guaranteed to be UTF-8-safe.
34
-
35
- decoded_value = unescape_unreserved(
36
- sanitize_string(value).
37
- force_encoding('ASCII-8BIT'))
38
-
39
42
  env[key] = transfer_frozen(value,
40
- escape_unreserved(
41
- sanitize_string(decoded_value)))
42
-
43
- elsif key =~ /^HTTP_/
43
+ sanitize_uri_encoded_string(value))
44
+ elsif key.start_with?("HTTP_")
44
45
  # Just sanitize the headers and leave them in UTF-8. There is
45
46
  # no reason to have UTF-8 in headers, but if it's valid, let it be.
46
-
47
47
  env[key] = transfer_frozen(value,
48
48
  sanitize_string(value))
49
49
  end
@@ -52,6 +52,73 @@ module Rack
52
52
 
53
53
  protected
54
54
 
55
+ def sanitize_rack_input(env)
56
+ # https://github.com/rack/rack/blob/master/lib/rack/request.rb#L42
57
+ # Logic borrowed from Rack::Request#media_type, #media_type_params, and #content_charset.
58
+ # Ignoring charset in content type.
59
+ content_type = env['CONTENT_TYPE']
60
+ content_type &&= content_type.split(/\s*[;,]\s*/, 2).first
61
+ content_type &&= content_type.downcase
62
+ return unless SANITIZABLE_CONTENT_TYPES.any? {|type| content_type == type }
63
+ env['rack.input'] &&= sanitize_io(env['rack.input'])
64
+ end
65
+
66
+ # Modeled after Rack::RewindableInput
67
+ # TODO: Should this delegate any methods to the original io?
68
+ class SanitizedRackInput
69
+ def initialize(original_io, sanitized_io)
70
+ @original_io = original_io
71
+ @sanitized_io = sanitized_io
72
+ end
73
+ def gets
74
+ @sanitized_io.gets
75
+ end
76
+ def read(*args)
77
+ @sanitized_io.read(*args)
78
+ end
79
+ def each(&block)
80
+ @sanitized_io.each(&block)
81
+ end
82
+ def rewind
83
+ @sanitized_io.rewind
84
+ end
85
+ def close
86
+ @sanitized_io.close
87
+ @original_io.close if @original_io.respond_to?(:close)
88
+ end
89
+ end
90
+
91
+ def sanitize_io(io)
92
+ input = io.read
93
+ sanitized_io = transfer_frozen(input,
94
+ sanitize_string(input))
95
+ SanitizedRackInput.new(io, StringIO.new(sanitized_io))
96
+ end
97
+
98
+ # URI.encode/decode expect the input to be in ASCII-8BIT.
99
+ # However, there could be invalid UTF-8 characters both in
100
+ # raw and percent-encoded form.
101
+ #
102
+ # So, first sanitize the value, then percent-decode it while
103
+ # treating it as UTF-8, then sanitize the result and encode it back.
104
+ #
105
+ # The result is guaranteed to be UTF-8-safe.
106
+ def sanitize_uri_encoded_string(input)
107
+ decoded_value = decode_string(input)
108
+ reencode_string(decoded_value)
109
+ end
110
+
111
+ def reencode_string(decoded_value)
112
+ escape_unreserved(
113
+ sanitize_string(decoded_value))
114
+ end
115
+
116
+ def decode_string(input)
117
+ unescape_unreserved(
118
+ sanitize_string(input).
119
+ force_encoding(Encoding::ASCII_8BIT))
120
+ end
121
+
55
122
  # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
56
123
  # plus all multibyte UTF-8 characters.
57
124
  UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
@@ -82,21 +149,21 @@ module Rack
82
149
  UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
83
150
 
84
151
  # Performs the reverse function of `unescape_unreserved`. Unlike
85
- # the previous function, we can reuse the logic in URI#escape.
152
+ # the previous function, we can reuse the logic in URI.encode.
86
153
  def escape_unreserved(input)
87
- URI.escape(input, UNSAFE)
154
+ URI.encode(input, UNSAFE)
88
155
  end
89
156
 
90
157
  def sanitize_string(input)
91
158
  if input.is_a? String
92
- input = input.dup.force_encoding('UTF-8')
159
+ input = input.dup.force_encoding(Encoding::UTF_8)
93
160
 
94
161
  if input.valid_encoding?
95
162
  input
96
163
  else
97
164
  input.
98
- force_encoding('ASCII-8BIT').
99
- encode!('UTF-8',
165
+ force_encoding(Encoding::ASCII_8BIT).
166
+ encode!(Encoding::UTF_8,
100
167
  invalid: :replace,
101
168
  undef: :replace)
102
169
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "rack-utf8_sanitizer"
5
- gem.version = '1.1.0'
5
+ gem.version = '1.2.1'
6
6
  gem.authors = ["Peter Zotov"]
7
7
  gem.email = ["whitequark@whitequark.org"]
8
8
  gem.description = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
@@ -151,4 +151,99 @@ describe Rack::UTF8Sanitizer do
151
151
  env["REQUEST_PATH"].should.be.frozen
152
152
  end
153
153
  end
154
+
155
+ describe "with form data" do
156
+ def request_env
157
+ @plain_input = "foo bar лол".force_encoding('UTF-8')
158
+ {
159
+ "REQUEST_METHOD" => "POST",
160
+ "CONTENT_TYPE" => "application/x-www-form-urlencoded;foo=bar",
161
+ "HTTP_USER_AGENT" => @plain_input,
162
+ "rack.input" => @rack_input,
163
+ }
164
+ end
165
+ def sanitize_form_data(request_env = request_env)
166
+ @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
167
+ env = @app.(request_env)
168
+ sanitized_input = env['rack.input'].read
169
+ yield sanitized_input if block_given?
170
+ env['rack.input'].rewind
171
+ behaves_like :does_sanitize_plain
172
+ behaves_like :does_sanitize_uri
173
+ behaves_like :identity_plain
174
+ behaves_like :identity_uri
175
+ env['rack.input'].close
176
+ end
177
+
178
+ it "sanitizes StringIO rack.input" do
179
+ input = "foo=bla&quux=bar"
180
+ @rack_input = StringIO.new input
181
+
182
+ sanitize_form_data do |sanitized_input|
183
+ sanitized_input.encoding.should == Encoding::UTF_8
184
+ sanitized_input.should.be.valid_encoding
185
+ sanitized_input.should == input
186
+ end
187
+ end
188
+
189
+ it "sanitizes StringIO rack.input with bad encoding" do
190
+ input = "foo=bla&quux=bar\xED"
191
+ @rack_input = StringIO.new input
192
+
193
+ sanitize_form_data do |sanitized_input|
194
+ sanitized_input.encoding.should == Encoding::UTF_8
195
+ sanitized_input.should.be.valid_encoding
196
+ sanitized_input.should != input
197
+ end
198
+ end
199
+
200
+ it "sanitizes non-StringIO rack.input" do
201
+ require 'rack/rewindable_input'
202
+ input = "foo=bla&quux=bar"
203
+ @rack_input = Rack::RewindableInput.new(StringIO.new(input))
204
+
205
+ sanitize_form_data do |sanitized_input|
206
+ sanitized_input.encoding.should == Encoding::UTF_8
207
+ sanitized_input.should.be.valid_encoding
208
+ sanitized_input.should == input
209
+ end
210
+ end
211
+
212
+ it "sanitizes non-StringIO rack.input with bad encoding" do
213
+ require 'rack/rewindable_input'
214
+ input = "foo=bla&quux=bar\xED"
215
+ @rack_input = Rack::RewindableInput.new(StringIO.new(input))
216
+
217
+ sanitize_form_data do |sanitized_input|
218
+ sanitized_input.encoding.should == Encoding::UTF_8
219
+ sanitized_input.should.be.valid_encoding
220
+ sanitized_input.should != input
221
+ end
222
+ end
223
+
224
+ it "does not sanitize the rack body if there is no CONTENT_TYPE" do
225
+ input = "foo=bla&quux=bar\xED"
226
+ @rack_input = StringIO.new input
227
+
228
+ env = request_env.update('CONTENT_TYPE' => nil)
229
+ sanitize_form_data(env) do |sanitized_input|
230
+ sanitized_input.encoding.should == Encoding::ASCII_8BIT
231
+ sanitized_input.should.be.valid_encoding
232
+ sanitized_input.should == input
233
+ end
234
+ end
235
+
236
+ it "does not sanitize the rack body if there is empty CONTENT_TYPE" do
237
+ input = "foo=bla&quux=bar\xED"
238
+ @rack_input = StringIO.new input
239
+
240
+ env = request_env.update('CONTENT_TYPE' => '')
241
+ sanitize_form_data(env) do |sanitized_input|
242
+ sanitized_input.encoding.should == Encoding::ASCII_8BIT
243
+ sanitized_input.should.be.valid_encoding
244
+ sanitized_input.should == input
245
+ end
246
+ end
247
+
248
+ end
154
249
  end
metadata CHANGED
@@ -1,62 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rack-utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
5
- prerelease:
4
+ version: 1.2.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Zotov
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-03-15 00:00:00.000000000 Z
11
+ date: 2014-05-27 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rack
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
19
  version: '1.0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - "~>"
28
25
  - !ruby/object:Gem::Version
29
26
  version: '1.0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: bacon
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - ">="
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - ">="
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: bacon-colored_output
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - ">="
52
46
  - !ruby/object:Gem::Version
53
47
  version: '0'
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
62
55
  description: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8
@@ -67,8 +60,8 @@ executables: []
67
60
  extensions: []
68
61
  extra_rdoc_files: []
69
62
  files:
70
- - .gitignore
71
- - .travis.yml
63
+ - ".gitignore"
64
+ - ".travis.yml"
72
65
  - Gemfile
73
66
  - LICENSE.txt
74
67
  - README.md
@@ -78,27 +71,26 @@ files:
78
71
  - test/test_utf8_sanitizer.rb
79
72
  homepage: http://github.com/whitequark/rack-utf8_sanitizer
80
73
  licenses: []
74
+ metadata: {}
81
75
  post_install_message:
82
76
  rdoc_options: []
83
77
  require_paths:
84
78
  - lib
85
79
  required_ruby_version: !ruby/object:Gem::Requirement
86
- none: false
87
80
  requirements:
88
- - - ! '>='
81
+ - - ">="
89
82
  - !ruby/object:Gem::Version
90
83
  version: '1.9'
91
84
  required_rubygems_version: !ruby/object:Gem::Requirement
92
- none: false
93
85
  requirements:
94
- - - ! '>='
86
+ - - ">="
95
87
  - !ruby/object:Gem::Version
96
88
  version: '0'
97
89
  requirements: []
98
90
  rubyforge_project:
99
- rubygems_version: 1.8.23
91
+ rubygems_version: 2.2.2
100
92
  signing_key:
101
- specification_version: 3
93
+ specification_version: 4
102
94
  summary: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters
103
95
  in request URI and headers.
104
96
  test_files: