rack-utf8_sanitizer 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: be92627f205eac2e80ab554ad5add8ca2a084026
4
+ data.tar.gz: 5325281ed65e1e17d74cabbb75df877e0387e2e8
5
+ SHA512:
6
+ metadata.gz: a3cc3842501b3c7e7bd4da92ae81432a6b6d014ab4af8a828a4269d781554fdb774e6c7fe3dd19ea582d959ea8d3093abc1984882587aede7084bf6e3381c10c
7
+ data.tar.gz: 108c557d20ce9c4716ae5d89fcd2878b8eb6ef3cd8a359d3e240a2cef575ef9a69a9cd129f8ab00bbde5628badb5f4d7f3f374badcaa04693162fa2f40531ac0
@@ -4,12 +4,10 @@ rvm:
4
4
  - 1.9.2
5
5
  - 1.9.3
6
6
  - 2.0.0
7
- - jruby-19mode
8
- - rbx-19mode
7
+ # Use 2.1 rather than 2.1.0 until https://github.com/travis-ci/travis-ci/issues/2220 is fixed.
8
+ - 2.1
9
+ - jruby
10
+ - rbx-2
9
11
 
10
12
  script:
11
13
  - rake spec
12
-
13
- matrix:
14
- allow_failures:
15
- - rvm: jruby-19mode
data/README.md CHANGED
@@ -19,7 +19,7 @@ Or install it yourself as:
19
19
  For Rails, add this to your `application.rb`:
20
20
 
21
21
  ``` ruby
22
- config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
22
+ config.middleware.insert_before "Rack::Runtime", Rack::UTF8Sanitizer
23
23
  ```
24
24
 
25
25
  For Rack apps, add this to `config.ru`:
@@ -30,7 +30,7 @@ use Rack::UTF8Sanitizer
30
30
 
31
31
  ## Usage
32
32
 
33
- Rack::UTF8Sanitizer divides all keys in the [Rack environment](rack.rubyforge.org/doc/SPEC.html) in two distinct groups: keys which contain raw data and the ones with percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
33
+ Rack::UTF8Sanitizer divides all keys in the [Rack environment](http://rack.rubyforge.org/doc/SPEC.html) into two distinct groups: keys which contain raw data and keys which contain percent-encoded data. The fields which are treated as percent-encoded are: `SCRIPT_NAME`, `REQUEST_PATH`, `REQUEST_URI`, `PATH_INFO`, `QUERY_STRING`, `HTTP_REFERER`.
34
34
 
35
35
  The generic sanitization algorithm is as follows:
36
36
 
@@ -1,9 +1,12 @@
1
1
  # encoding: ascii-8bit
2
2
 
3
3
  require 'uri'
4
+ require 'stringio'
4
5
 
5
6
  module Rack
6
7
  class UTF8Sanitizer
8
+ StringIO = ::StringIO
9
+
7
10
  def initialize(app)
8
11
  @app = app
9
12
  end
@@ -20,30 +23,27 @@ module Rack
20
23
  HTTP_REFERER
21
24
  )
22
25
 
26
# Media types whose request bodies are sanitized; a body of any other
# type is left untouched. Frozen so the shared list cannot be mutated.
SANITIZABLE_CONTENT_TYPES = %w(
  text/plain
  application/x-www-form-urlencoded
).freeze

# MRI-optimization: shared, frozen method-name strings used for the
# REQUEST_METHOD comparison in #sanitize.
POST = 'POST'.freeze
PUT  = 'PUT'.freeze
34
+
23
35
  def sanitize(env)
36
+ request_method = env['REQUEST_METHOD']
37
+ if request_method == POST || request_method == PUT
38
+ sanitize_rack_input(env)
39
+ end
24
40
  env.each do |key, value|
25
41
  if URI_FIELDS.include?(key)
26
- # URI.encode/decode expect the input to be in ASCII-8BIT.
27
- # However, there could be invalid UTF-8 characters both in
28
- # raw and percent-encoded form.
29
- #
30
- # So, first sanitize the value, then percent-decode it while
31
- # treating as UTF-8, then sanitize the result and encode it back.
32
- #
33
- # The result is guaranteed to be UTF-8-safe.
34
-
35
- decoded_value = unescape_unreserved(
36
- sanitize_string(value).
37
- force_encoding('ASCII-8BIT'))
38
-
39
42
  env[key] = transfer_frozen(value,
40
- escape_unreserved(
41
- sanitize_string(decoded_value)))
42
-
43
- elsif key =~ /^HTTP_/
43
+ sanitize_uri_encoded_string(value))
44
+ elsif key.start_with?("HTTP_")
44
45
  # Just sanitize the headers and leave them in UTF-8. There is
45
46
  # no reason to have UTF-8 in headers, but if it's valid, let it be.
46
-
47
47
  env[key] = transfer_frozen(value,
48
48
  sanitize_string(value))
49
49
  end
@@ -52,6 +52,73 @@ module Rack
52
52
 
53
53
  protected
54
54
 
55
def sanitize_rack_input(env)
  # Media-type extraction follows Rack::Request#media_type,
  # #media_type_params and #content_charset, see
  # https://github.com/rack/rack/blob/master/lib/rack/request.rb#L42
  # Only the part before the first ';' or ',' is kept, so any charset
  # parameter in the content type is ignored.
  media_type = env['CONTENT_TYPE']
  media_type = media_type.split(/\s*[;,]\s*/, 2).first if media_type
  media_type = media_type.downcase if media_type
  return unless SANITIZABLE_CONTENT_TYPES.include?(media_type)

  # Leave a missing (nil/false) input stream exactly as it is.
  raw_input = env['rack.input']
  return raw_input unless raw_input

  env['rack.input'] = sanitize_io(raw_input)
end
65
+
66
# Modeled after Rack::RewindableInput.
#
# Stands in for the request's input stream: every read is served from
# the sanitized copy, while the original stream is kept only so that
# it can be closed together with the copy.
#
# TODO: Should this delegate any methods to the original io?
class SanitizedRackInput
  # original_io  - the stream this object replaces
  # sanitized_io - IO-like object holding the sanitized body
  def initialize(original_io, sanitized_io)
    @original_io  = original_io
    @sanitized_io = sanitized_io
  end

  def gets
    @sanitized_io.gets
  end

  def read(*args)
    @sanitized_io.read(*args)
  end

  def each(&block)
    @sanitized_io.each(&block)
  end

  def rewind
    @sanitized_io.rewind
  end

  # Closes the sanitized copy and then the original stream. The
  # original is closed from `ensure` so that it is still released when
  # closing the copy raises.
  def close
    @sanitized_io.close
  ensure
    @original_io.close if @original_io.respond_to?(:close)
  end
end
90
+
91
def sanitize_io(io)
  # Drain the stream once; later reads are served from the in-memory
  # sanitized copy held by the wrapper.
  raw_body   = io.read
  clean_body = sanitize_string(raw_body)
  # NOTE(review): transfer_frozen is defined outside this view;
  # presumably it carries the frozen state of raw_body over to the
  # sanitized string. Confirm against its definition.
  buffer = StringIO.new(transfer_frozen(raw_body, clean_body))
  SanitizedRackInput.new(io, buffer)
end
97
+
98
# Sanitizes a percent-encoded value.
#
# URI.encode/decode expect ASCII-8BIT input, yet invalid UTF-8 can be
# present both as raw bytes and in percent-encoded form. The value is
# therefore sanitized and percent-decoded (decode_string), and the
# decoded text is sanitized once more and encoded back
# (reencode_string).
#
# The result is guaranteed to be UTF-8-safe.
def sanitize_uri_encoded_string(input)
  reencode_string(decode_string(input))
end
110
+
111
def reencode_string(decoded_value)
  # Sanitize the decoded text first, then percent-encode the result.
  sanitized = sanitize_string(decoded_value)
  escape_unreserved(sanitized)
end
115
+
116
def decode_string(input)
  # Sanitize the raw value, retag it as binary, then percent-decode it.
  binary = sanitize_string(input).force_encoding(Encoding::ASCII_8BIT)
  unescape_unreserved(binary)
end
121
+
55
122
  # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
56
123
  # plus all multibyte UTF-8 characters.
57
124
  UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
@@ -82,21 +149,21 @@ module Rack
82
149
  UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
83
150
 
84
151
  # Performs the reverse function of `unescape_unreserved`. Unlike
85
- # the previous function, we can reuse the logic in URI#escape.
152
+ # the previous function, we can reuse the logic in URI#encode
86
153
  def escape_unreserved(input)
87
- URI.escape(input, UNSAFE)
154
+ URI.encode(input, UNSAFE)
88
155
  end
89
156
 
90
157
  def sanitize_string(input)
91
158
  if input.is_a? String
92
- input = input.dup.force_encoding('UTF-8')
159
+ input = input.dup.force_encoding(Encoding::UTF_8)
93
160
 
94
161
  if input.valid_encoding?
95
162
  input
96
163
  else
97
164
  input.
98
- force_encoding('ASCII-8BIT').
99
- encode!('UTF-8',
165
+ force_encoding(Encoding::ASCII_8BIT).
166
+ encode!(Encoding::UTF_8,
100
167
  invalid: :replace,
101
168
  undef: :replace)
102
169
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "rack-utf8_sanitizer"
5
- gem.version = '1.1.0'
5
+ gem.version = '1.2.1'
6
6
  gem.authors = ["Peter Zotov"]
7
7
  gem.email = ["whitequark@whitequark.org"]
8
8
  gem.description = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
@@ -151,4 +151,99 @@ describe Rack::UTF8Sanitizer do
151
151
  env["REQUEST_PATH"].should.be.frozen
152
152
  end
153
153
  end
154
+
155
+ describe "with form data" do
156
# Builds the POST environment used by the form-data examples. It
# records the UTF-8 user-agent value in @plain_input and uses whatever
# @rack_input currently holds as the body stream.
def request_env
  @plain_input = "foo bar лол".force_encoding('UTF-8')

  env = {}
  env["REQUEST_METHOD"]  = "POST"
  env["CONTENT_TYPE"]    = "application/x-www-form-urlencoded;foo=bar"
  env["HTTP_USER_AGENT"] = @plain_input
  env["rack.input"]      = @rack_input
  env
end
165
# Sends a request environment through the app and exercises the
# resulting `rack.input`.
#
# request_env - the environment to send; defaults to the hash built by
#               the #request_env helper. The default is spelled
#               `request_env()` on purpose: on Ruby 2.2 and later a
#               bare `request_env` in its own default refers to the
#               not-yet-assigned parameter (nil, with a "circular
#               argument reference" warning) instead of the helper.
#
# Yields the sanitized body to the optional block, then rewinds the
# stream, runs the shared behaviours and finally closes the stream.
def sanitize_form_data(request_env = request_env())
  @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
  env = @app.(request_env)
  sanitized_input = env['rack.input'].read
  yield sanitized_input if block_given?
  env['rack.input'].rewind
  behaves_like :does_sanitize_plain
  behaves_like :does_sanitize_uri
  behaves_like :identity_plain
  behaves_like :identity_uri
  env['rack.input'].close
end
177
+
178
it "sanitizes StringIO rack.input" do
  # A well-formed body is expected back unchanged and tagged as UTF-8.
  payload = "foo=bla&quux=bar"
  @rack_input = StringIO.new(payload)

  sanitize_form_data do |body|
    body.encoding.should == Encoding::UTF_8
    body.should.be.valid_encoding
    body.should == payload
  end
end
188
+
189
it "sanitizes StringIO rack.input with bad encoding" do
  # The trailing \xED byte is not valid UTF-8, so the sanitized body
  # is expected to differ from what was sent.
  payload = "foo=bla&quux=bar\xED"
  @rack_input = StringIO.new(payload)

  sanitize_form_data do |body|
    body.encoding.should == Encoding::UTF_8
    body.should.be.valid_encoding
    body.should != payload
  end
end
199
+
200
it "sanitizes non-StringIO rack.input" do
  require 'rack/rewindable_input'
  # Same expectations as the StringIO case, with the body wrapped in
  # a Rack::RewindableInput.
  payload = "foo=bla&quux=bar"
  backing_io = StringIO.new(payload)
  @rack_input = Rack::RewindableInput.new(backing_io)

  sanitize_form_data do |body|
    body.encoding.should == Encoding::UTF_8
    body.should.be.valid_encoding
    body.should == payload
  end
end
211
+
212
it "sanitizes non-StringIO rack.input with bad encoding" do
  require 'rack/rewindable_input'
  # An invalid byte behind a Rack::RewindableInput is expected to be
  # sanitized just like one in a plain StringIO.
  payload = "foo=bla&quux=bar\xED"
  backing_io = StringIO.new(payload)
  @rack_input = Rack::RewindableInput.new(backing_io)

  sanitize_form_data do |body|
    body.encoding.should == Encoding::UTF_8
    body.should.be.valid_encoding
    body.should != payload
  end
end
223
+
224
it "does not sanitize the rack body if there is no CONTENT_TYPE" do
  # With CONTENT_TYPE set to nil the body is expected to pass through
  # byte-for-byte, still tagged as binary.
  payload = "foo=bla&quux=bar\xED"
  @rack_input = StringIO.new(payload)

  env_without_type = request_env.update('CONTENT_TYPE' => nil)
  sanitize_form_data(env_without_type) do |body|
    body.encoding.should == Encoding::ASCII_8BIT
    body.should.be.valid_encoding
    body.should == payload
  end
end
235
+
236
it "does not sanitize the rack body if there is empty CONTENT_TYPE" do
  # An empty CONTENT_TYPE is expected to behave like a missing one:
  # the body passes through byte-for-byte, still tagged as binary.
  payload = "foo=bla&quux=bar\xED"
  @rack_input = StringIO.new(payload)

  env_with_blank_type = request_env.update('CONTENT_TYPE' => '')
  sanitize_form_data(env_with_blank_type) do |body|
    body.encoding.should == Encoding::ASCII_8BIT
    body.should.be.valid_encoding
    body.should == payload
  end
end
247
+
248
+ end
154
249
  end
metadata CHANGED
@@ -1,62 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rack-utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
5
- prerelease:
4
+ version: 1.2.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Peter Zotov
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-03-15 00:00:00.000000000 Z
11
+ date: 2014-05-27 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rack
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
- - - ~>
17
+ - - "~>"
20
18
  - !ruby/object:Gem::Version
21
19
  version: '1.0'
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
- - - ~>
24
+ - - "~>"
28
25
  - !ruby/object:Gem::Version
29
26
  version: '1.0'
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: bacon
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
- - - ! '>='
31
+ - - ">="
36
32
  - !ruby/object:Gem::Version
37
33
  version: '0'
38
34
  type: :development
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
- - - ! '>='
38
+ - - ">="
44
39
  - !ruby/object:Gem::Version
45
40
  version: '0'
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: bacon-colored_output
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
- - - ! '>='
45
+ - - ">="
52
46
  - !ruby/object:Gem::Version
53
47
  version: '0'
54
48
  type: :development
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
- - - ! '>='
52
+ - - ">="
60
53
  - !ruby/object:Gem::Version
61
54
  version: '0'
62
55
  description: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8
@@ -67,8 +60,8 @@ executables: []
67
60
  extensions: []
68
61
  extra_rdoc_files: []
69
62
  files:
70
- - .gitignore
71
- - .travis.yml
63
+ - ".gitignore"
64
+ - ".travis.yml"
72
65
  - Gemfile
73
66
  - LICENSE.txt
74
67
  - README.md
@@ -78,27 +71,26 @@ files:
78
71
  - test/test_utf8_sanitizer.rb
79
72
  homepage: http://github.com/whitequark/rack-utf8_sanitizer
80
73
  licenses: []
74
+ metadata: {}
81
75
  post_install_message:
82
76
  rdoc_options: []
83
77
  require_paths:
84
78
  - lib
85
79
  required_ruby_version: !ruby/object:Gem::Requirement
86
- none: false
87
80
  requirements:
88
- - - ! '>='
81
+ - - ">="
89
82
  - !ruby/object:Gem::Version
90
83
  version: '1.9'
91
84
  required_rubygems_version: !ruby/object:Gem::Requirement
92
- none: false
93
85
  requirements:
94
- - - ! '>='
86
+ - - ">="
95
87
  - !ruby/object:Gem::Version
96
88
  version: '0'
97
89
  requirements: []
98
90
  rubyforge_project:
99
- rubygems_version: 1.8.23
91
+ rubygems_version: 2.2.2
100
92
  signing_key:
101
- specification_version: 3
93
+ specification_version: 4
102
94
  summary: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters
103
95
  in request URI and headers.
104
96
  test_files: