rack-utf8_sanitizer 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,3 +9,7 @@ rvm:
9
9
 
10
10
  script:
11
11
  - rake spec
12
+
13
+ matrix:
14
+ allow_failures:
15
+ - rvm: jruby-19mode
data/README.md CHANGED
@@ -18,11 +18,15 @@ Or install it yourself as:
18
18
 
19
19
  For Rails, add this to your `application.rb`:
20
20
 
21
- config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
21
+ ``` ruby
22
+ config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
23
+ ```
22
24
 
23
25
  For Rack apps, add this to `config.ru`:
24
26
 
25
- use Rack::UTF8Sanitizer
27
+ ``` ruby
28
+ use Rack::UTF8Sanitizer
29
+ ```
26
30
 
27
31
  ## Usage
28
32
 
@@ -1,3 +1,5 @@
1
+ # encoding: ascii-8bit
2
+
1
3
  require 'uri'
2
4
 
3
5
  module Rack
@@ -30,12 +32,13 @@ module Rack
30
32
  #
31
33
  # The result is guaranteed to be UTF-8-safe.
32
34
 
33
- decoded_value = URI.decode(
35
+ decoded_value = unescape_unreserved(
34
36
  sanitize_string(value).
35
37
  force_encoding('ASCII-8BIT'))
36
38
 
37
39
  env[key] = transfer_frozen(value,
38
- URI.encode(sanitize_string(decoded_value)))
40
+ escape_unreserved(
41
+ sanitize_string(decoded_value)))
39
42
 
40
43
  elsif key =~ /^HTTP_/
41
44
  # Just sanitize the headers and leave them in UTF-8. There is
@@ -49,6 +52,41 @@ module Rack
49
52
 
50
53
  protected
51
54
 
55
+ # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
56
+ # plus all bytes in the 0x80-0xFF range, i.e. the bytes of multibyte UTF-8 characters.
57
+ UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
58
+
59
+ # RFC3986, 2.2 states that the characters from 'reserved' group must be
60
+ # protected during normalization (which is what UTF8Sanitizer does).
61
+ #
62
+ # However, the regexp approach used by URI.unescape is not sophisticated
63
+ # enough for our task.
64
+ def unescape_unreserved(input)
65
+ input.gsub(/%([a-f\d]{2})/i) do |encoded|
66
+ decoded = [$1.hex].pack('C')
67
+
68
+ if decoded =~ UNRESERVED_OR_UTF8
69
+ decoded
70
+ else
71
+ encoded
72
+ end
73
+ end
74
+ end
75
+
76
+ # This regexp matches unsafe characters, i.e. everything except 'reserved'
77
+ # and 'unreserved' characters from RFC3986 (2.3), and additionally '%',
78
+ # as percent-encoded unreserved characters could be left over from the
79
+ # `unescape_unreserved` invocation.
80
+ #
81
+ # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
82
+ UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
83
+
84
+ # Performs the reverse function of `unescape_unreserved`. Unlike
85
+ # the previous function, we can reuse the logic in URI.escape.
86
+ def escape_unreserved(input)
87
+ URI.escape(input, UNSAFE)
88
+ end
89
+
52
90
  def sanitize_string(input)
53
91
  if input.is_a? String
54
92
  input = input.dup.force_encoding('UTF-8')
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "rack-utf8_sanitizer"
5
- gem.version = '1.0.0'
5
+ gem.version = '1.1.0'
6
6
  gem.authors = ["Peter Zotov"]
7
7
  gem.email = ["whitequark@whitequark.org"]
8
8
  gem.description = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
@@ -31,7 +31,7 @@ describe Rack::UTF8Sanitizer do
31
31
  describe "with invalid UTF-8 input" do
32
32
  before do
33
33
  @plain_input = "foo\xe0".force_encoding('UTF-8')
34
- @uri_input = "foo%E0".force_encoding('UTF-8')
34
+ @uri_input = "http://bar/foo%E0".force_encoding('UTF-8')
35
35
  end
36
36
 
37
37
  behaves_like :does_sanitize_plain
@@ -40,7 +40,7 @@ describe Rack::UTF8Sanitizer do
40
40
 
41
41
  describe "with invalid, incorrectly percent-encoded UTF-8 URI input" do
42
42
  before do
43
- @uri_input = "foo%E0\xe0".force_encoding('UTF-8')
43
+ @uri_input = "http://bar/foo%E0\xe0".force_encoding('UTF-8')
44
44
  end
45
45
 
46
46
  behaves_like :does_sanitize_uri
@@ -49,7 +49,7 @@ describe Rack::UTF8Sanitizer do
49
49
  describe "with invalid ASCII-8BIT input" do
50
50
  before do
51
51
  @plain_input = "foo\xe0"
52
- @uri_input = "foo%E0"
52
+ @uri_input = "http://bar/foo%E0"
53
53
  end
54
54
 
55
55
  behaves_like :does_sanitize_plain
@@ -58,7 +58,7 @@ describe Rack::UTF8Sanitizer do
58
58
 
59
59
  describe "with invalid, incorrectly percent-encoded ASCII-8BIT URI input" do
60
60
  before do
61
- @uri_input = "foo%E0\xe0"
61
+ @uri_input = "http://bar/foo%E0\xe0"
62
62
  end
63
63
 
64
64
  behaves_like :does_sanitize_uri
@@ -89,16 +89,24 @@ describe Rack::UTF8Sanitizer do
89
89
  describe "with valid UTF-8 input" do
90
90
  before do
91
91
  @plain_input = "foo bar лол".force_encoding('UTF-8')
92
- @uri_input = "foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
92
+ @uri_input = "http://bar/foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
93
93
  end
94
94
 
95
95
  behaves_like :identity_plain
96
96
  behaves_like :identity_uri
97
+
98
+ describe "with URI characters from reserved range" do
99
+ before do
100
+ @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
101
+ end
102
+
103
+ behaves_like :identity_uri
104
+ end
97
105
  end
98
106
 
99
107
  describe "with valid, not percent-encoded UTF-8 URI input" do
100
108
  before do
101
- @uri_input = "foo+bar+лол".force_encoding('UTF-8')
109
+ @uri_input = "http://bar/foo+bar+лол".force_encoding('UTF-8')
102
110
  end
103
111
 
104
112
  it "does not change URI-like entity (REQUEST_PATH)" do
@@ -114,17 +122,25 @@ describe Rack::UTF8Sanitizer do
114
122
  describe "with valid ASCII-8BIT input" do
115
123
  before do
116
124
  @plain_input = "bar baz"
117
- @uri_input = "bar+baz"
125
+ @uri_input = "http://bar/bar+baz"
118
126
  end
119
127
 
120
128
  behaves_like :identity_plain
121
129
  behaves_like :identity_uri
130
+
131
+ describe "with URI characters from reserved range" do
132
+ before do
133
+ @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB"
134
+ end
135
+
136
+ behaves_like :identity_uri
137
+ end
122
138
  end
123
139
 
124
140
  describe "with frozen strings" do
125
141
  before do
126
142
  @plain_input = "bar baz".freeze
127
- @uri_input = "bar+baz".freeze
143
+ @uri_input = "http://bar/bar+baz".freeze
128
144
  end
129
145
 
130
146
  it "preserves the frozen? status of input" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rack-utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-05 00:00:00.000000000 Z
12
+ date: 2013-03-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rack