rack-utf8_sanitizer 1.8.0 → 1.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 045740a7d869543a26c071de26ee6585d00e6193eaa2a5a02bfe09142cfe11c1
4
- data.tar.gz: '050977cbbb72a835dea65e4df6bd75d6837c216b2a2f68eecd83701f1153e7ff'
3
+ metadata.gz: 7825c2fec2176e38043c4d7a3c1fcbe1cf112bcc7a17a7ef42b249fab30118c4
4
+ data.tar.gz: 5090e3c92af9a74377d559be48685d343b29315e5a9ce0f76faf36a8b96437ee
5
5
  SHA512:
6
- metadata.gz: 611f078bdbe5f0247eac8ba3258a174eb70a1d3263b49ca0d4d261a8fd3de1b260da9defc36bc644eb2e6805211038adddf4396a9f7b0bb48fbd932e41991f97
7
- data.tar.gz: 1bc7f43fbd004ac010a7829cb8077b2bdb245500670d2fc4d22372ccc44c959db47a3853aa68f1eb4c1533b5202ee6fdb44d368d722be7fa9439dc6049ddbef2
6
+ metadata.gz: e20607b2c412ecfb3d2ba719a7d0aeb381cc4f685e08c6a7801fb2b60a0993c71cc5c219b9ff17cedb497f7d5d0ee907da94ab91476960143f25c704058f1ebc
7
+ data.tar.gz: 7df7e1d357a6d3b12f089c1d7fea0a55eeb31d2d8f7e3d2b2e2e8729c1ae21c6260a9eb370e8c54e6886344986b1dd436d55b404f7321263d6bbf120115d1788
@@ -10,7 +10,7 @@ jobs:
10
10
  strategy:
11
11
  fail-fast: false
12
12
  matrix:
13
- ruby: ["2.5", "2.6", "2.7", "3.0", "3.1", ruby-head, jruby-9.2, jruby-9.3, jruby-head]
13
+ ruby: ["2.5", "2.6", "2.7", "3.0", "3.1", "3.2", ruby-head, jruby-9.2, jruby-9.3, jruby-head]
14
14
 
15
15
  steps:
16
16
  - uses: actions/checkout@v3
data/README.md CHANGED
@@ -113,7 +113,7 @@ config.middleware.insert 0, Rack::UTF8Sanitizer, strategy: :exception
113
113
  ```
114
114
 
115
115
  ```ruby
116
- replace_string = lambda do |_invalid|
116
+ replace_string = lambda do |_invalid, sanitize_null_bytes: false|
117
117
  Rails.logger.warn('Replacing invalid string')
118
118
 
119
119
  '<Bad Encoding>'.freeze
@@ -122,6 +122,10 @@ end
122
122
  config.middleware.insert 0, Rack::UTF8Sanitizer, strategy: replace_string
123
123
  ```
124
124
 
125
+ ### Sanitizing Null Bytes
126
+
127
+ While null bytes are valid UTF-8, it can be useful to further restrict the valid character set to exclude them. For example, PostgreSQL text columns do not allow storing null bytes. Passing `sanitize_null_bytes: true` in the configuration hash enables null-byte sanitizing, and both built-in strategies support it: `:replace` strips null bytes from the input, while `:exception` raises `Rack::UTF8Sanitizer::NullByteInString`. Custom strategies should accept a keyword argument `sanitize_null_bytes` containing this configuration value.
128
+
125
129
  ## Contributing
126
130
 
127
131
  1. Fork it
@@ -7,6 +7,9 @@ module Rack
7
7
  class UTF8Sanitizer
8
8
  StringIO = ::StringIO
9
9
  BAD_REQUEST = [400, { "Content-Type" => "text/plain" }, ["Bad Request"]]
10
+ NULL_BYTE_REGEX = /\x00/.freeze
11
+
12
+ class NullByteInString < StandardError; end
10
13
 
11
14
  # options[:sanitizable_content_types] Array
12
15
  # options[:additional_content_types] Array
@@ -17,6 +20,7 @@ module Rack
17
20
  @sanitizable_content_types ||= SANITIZABLE_CONTENT_TYPES + (options[:additional_content_types] || [])
18
21
  @only = Array(options[:only]).flatten
19
22
  @except = Array(options[:except]).flatten
23
+ @sanitize_null_bytes = options.fetch(:sanitize_null_bytes, false)
20
24
  end
21
25
 
22
26
  def call(env)
@@ -29,21 +33,29 @@ module Rack
29
33
  end
30
34
 
31
35
  DEFAULT_STRATEGIES = {
32
- replace: lambda do |input|
36
+ replace: lambda do |input, sanitize_null_bytes: false|
33
37
  input.
34
38
  force_encoding(Encoding::ASCII_8BIT).
35
39
  encode!(Encoding::UTF_8,
36
40
  invalid: :replace,
37
41
  undef: :replace)
42
+ if sanitize_null_bytes
43
+ input = input.gsub(NULL_BYTE_REGEX, "")
44
+ end
45
+ input
38
46
  end,
39
- exception: lambda do |input|
47
+ exception: lambda do |input, sanitize_null_bytes: false|
40
48
  input.
41
49
  force_encoding(Encoding::ASCII_8BIT).
42
50
  encode!(Encoding::UTF_8)
51
+ if sanitize_null_bytes && input =~ NULL_BYTE_REGEX
52
+ raise NullByteInString
53
+ end
54
+ input
43
55
  end
44
56
  }.freeze
45
57
 
46
- # http://rack.rubyforge.org/doc/SPEC.html
58
+ # https://github.com/rack/rack/blob/main/SPEC.rdoc
47
59
  URI_FIELDS = %w(
48
60
  SCRIPT_NAME
49
61
  REQUEST_PATH REQUEST_URI PATH_INFO
@@ -207,7 +219,8 @@ module Rack
207
219
 
208
220
  # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
209
221
  # plus all multibyte UTF-8 characters.
210
- UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
222
+ UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/.freeze
223
+ UNRESERVED_OR_UTF8_OR_NULL = /[A-Za-z0-9\-._~\x00\x80-\xFF]/.freeze
211
224
 
212
225
  # RFC3986, 2.2 states that the characters from 'reserved' group must be
213
226
  # protected during normalization (which is what UTF8Sanitizer does).
@@ -218,7 +231,8 @@ module Rack
218
231
  input.gsub(/%([a-f\d]{2})/i) do |encoded|
219
232
  decoded = $1.hex.chr
220
233
 
221
- if decoded =~ UNRESERVED_OR_UTF8
234
+ decodable_regex = @sanitize_null_bytes ? UNRESERVED_OR_UTF8_OR_NULL : UNRESERVED_OR_UTF8
235
+ if decoded =~ decodable_regex
222
236
  decoded
223
237
  else
224
238
  encoded
@@ -244,10 +258,10 @@ module Rack
244
258
  if input.is_a? String
245
259
  input = input.dup.force_encoding(Encoding::UTF_8)
246
260
 
247
- if input.valid_encoding?
261
+ if input.valid_encoding? && !(@sanitize_null_bytes && input =~ NULL_BYTE_REGEX)
248
262
  input
249
263
  else
250
- @strategy.call(input)
264
+ @strategy.call(input, sanitize_null_bytes: @sanitize_null_bytes)
251
265
  end
252
266
  else
253
267
  input
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "rack-utf8_sanitizer"
5
- gem.version = '1.8.0'
5
+ gem.version = '1.9.1'
6
6
  gem.authors = ["whitequark"]
7
7
  gem.license = "MIT"
8
8
  gem.email = ["whitequark@whitequark.org"]
@@ -337,6 +337,71 @@ describe Rack::UTF8Sanitizer do
337
337
  @response_env["CONTENT_LENGTH"].should == sanitized_input.bytesize.to_s
338
338
  end
339
339
  end
340
+
341
+ it "does not sanitize null bytes by default" do
342
+ input = "foo=bla&quux=bar%00"
343
+ @rack_input = StringIO.new input
344
+
345
+ sanitize_form_data do |sanitized_input|
346
+ sanitized_input.encoding.should == Encoding::UTF_8
347
+ sanitized_input.should.be.valid_encoding
348
+ sanitized_input.should == input
349
+ end
350
+ end
351
+
352
+ it "optionally sanitizes null bytes with the replace strategy" do
353
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true)
354
+ input = "foo=bla\xED&quux=bar\x00"
355
+ @rack_input = StringIO.new input
356
+
357
+ sanitize_form_data do |sanitized_input|
358
+ sanitized_input.encoding.should == Encoding::UTF_8
359
+ sanitized_input.should.be.valid_encoding
360
+ sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar"
361
+ end
362
+ end
363
+
364
+ it "optionally sanitizes encoded null bytes with the replace strategy" do
365
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true)
366
+ input = "foo=bla%ED&quux=bar%00"
367
+ @rack_input = StringIO.new input
368
+
369
+ sanitize_form_data do |sanitized_input|
370
+ sanitized_input.encoding.should == Encoding::UTF_8
371
+ sanitized_input.should.be.valid_encoding
372
+ sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar"
373
+ end
374
+ end
375
+
376
+ it "optionally raises on null bytes with the exception strategy" do
377
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception)
378
+ input = "foo=bla&quux=bar\x00"
379
+ @rack_input = StringIO.new input
380
+
381
+ should.raise(Rack::UTF8Sanitizer::NullByteInString) do
382
+ sanitize_form_data
383
+ end
384
+ end
385
+
386
+ it "optionally raises on encoded null bytes with the exception strategy" do
387
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception)
388
+ input = "foo=bla&quux=bar%00"
389
+ @rack_input = StringIO.new input
390
+
391
+ should.raise(Rack::UTF8Sanitizer::NullByteInString) do
392
+ sanitize_form_data
393
+ end
394
+ end
395
+
396
+ it "gives precedence to encoding errors with the exception strategy and null byte sanitisation" do
397
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: :exception)
398
+ input = "foo=bla\x00&quux=bar\xED"
399
+ @rack_input = StringIO.new input
400
+
401
+ should.raise(EncodingError) do
402
+ sanitize_form_data
403
+ end
404
+ end
340
405
  end
341
406
 
342
407
  describe "with custom content-type" do
@@ -552,7 +617,10 @@ describe Rack::UTF8Sanitizer do
552
617
  end
553
618
 
554
619
  it "accepts a proc as a strategy" do
555
- truncate = -> input { 'replace'.force_encoding(Encoding::UTF_8) }
620
+ truncate = -> (input, sanitize_null_bytes:) do
621
+ sanitize_null_bytes.should == false
622
+ 'replace'.force_encoding(Encoding::UTF_8)
623
+ end
556
624
 
557
625
  @app = Rack::UTF8Sanitizer.new(-> env { env }, strategy: truncate)
558
626
 
@@ -566,5 +634,24 @@ describe Rack::UTF8Sanitizer do
566
634
  sanitized_input.should == 'replace'
567
635
  end
568
636
  end
637
+
638
+ it "accepts a proc as a strategy and passes along sanitize_null_bytes" do
639
+ truncate = -> (input, sanitize_null_bytes:) do
640
+ sanitize_null_bytes.should == true
641
+ 'replace'.force_encoding(Encoding::UTF_8)
642
+ end
643
+
644
+ @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true, strategy: truncate)
645
+ input = "foo=bla&quux=bar\x00"
646
+
647
+ @rack_input = StringIO.new input
648
+
649
+ env = request_env
650
+ sanitize_data(env) do |sanitized_input|
651
+ sanitized_input.encoding.should == Encoding::UTF_8
652
+ sanitized_input.should.be.valid_encoding
653
+ sanitized_input.should == 'replace'
654
+ end
655
+ end
569
656
  end
570
657
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rack-utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.8.0
4
+ version: 1.9.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - whitequark
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-25 00:00:00.000000000 Z
11
+ date: 2023-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
@@ -97,7 +97,7 @@ homepage: http://github.com/whitequark/rack-utf8_sanitizer
97
97
  licenses:
98
98
  - MIT
99
99
  metadata: {}
100
- post_install_message:
100
+ post_install_message:
101
101
  rdoc_options: []
102
102
  require_paths:
103
103
  - lib
@@ -112,8 +112,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
112
112
  - !ruby/object:Gem::Version
113
113
  version: '0'
114
114
  requirements: []
115
- rubygems_version: 3.2.5
116
- signing_key:
115
+ rubygems_version: 3.3.15
116
+ signing_key:
117
117
  specification_version: 4
118
118
  summary: Rack::UTF8Sanitizer is a Rack middleware which cleans up invalid UTF8 characters
119
119
  in request URI and headers.