rack-utf8_sanitizer 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,3 +9,7 @@ rvm:
9
9
 
10
10
  script:
11
11
  - rake spec
12
+
13
+ matrix:
14
+ allow_failures:
15
+ - rvm: jruby-19mode
data/README.md CHANGED
@@ -18,11 +18,15 @@ Or install it yourself as:
18
18
 
19
19
  For Rails, add this to your `application.rb`:
20
20
 
21
- config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
21
+ ``` ruby
22
+ config.middleware.insert_before "Rack::Lock", Rack::UTF8Sanitizer
23
+ ```
22
24
 
23
25
  For Rack apps, add this to `config.ru`:
24
26
 
25
- use Rack::UTF8Sanitizer
27
+ ``` ruby
28
+ use Rack::UTF8Sanitizer
29
+ ```
26
30
 
27
31
  ## Usage
28
32
 
@@ -1,3 +1,5 @@
1
+ # encoding: ascii-8bit
2
+
1
3
  require 'uri'
2
4
 
3
5
  module Rack
@@ -30,12 +32,13 @@ module Rack
30
32
  #
31
33
  # The result is guaranteed to be UTF-8-safe.
32
34
 
33
- decoded_value = URI.decode(
35
+ decoded_value = unescape_unreserved(
34
36
  sanitize_string(value).
35
37
  force_encoding('ASCII-8BIT'))
36
38
 
37
39
  env[key] = transfer_frozen(value,
38
- URI.encode(sanitize_string(decoded_value)))
40
+ escape_unreserved(
41
+ sanitize_string(decoded_value)))
39
42
 
40
43
  elsif key =~ /^HTTP_/
41
44
  # Just sanitize the headers and leave them in UTF-8. There is
@@ -49,6 +52,41 @@ module Rack
49
52
 
50
53
  protected
51
54
 
55
+ # This regexp matches all 'unreserved' characters from RFC3986 (2.3),
56
+ # plus all multibyte UTF-8 characters.
57
+ UNRESERVED_OR_UTF8 = /[A-Za-z0-9\-._~\x80-\xFF]/
58
+
59
+ # RFC3986, 2.2 states that the characters from 'reserved' group must be
60
+ # protected during normalization (which is what UTF8Sanitizer does).
61
+ #
62
+ # However, the regexp approach used by URI.unescape is not sophisticated
63
+ # enough for our task.
64
+ def unescape_unreserved(input)
65
+ input.gsub(/%([a-f\d]{2})/i) do |encoded|
66
+ decoded = [$1.hex].pack('C')
67
+
68
+ if decoded =~ UNRESERVED_OR_UTF8
69
+ decoded
70
+ else
71
+ encoded
72
+ end
73
+ end
74
+ end
75
+
76
+ # This regexp matches unsafe characters, i.e. everything except 'reserved'
77
+ # and 'unreserved' characters from RFC3986 (2.2, 2.3), and additionally '%',
78
+ # as percent-encoded reserved characters could be left over from the
79
+ # `unescape_unreserved` invocation.
80
+ #
81
+ # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
82
+ UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/
83
+
84
+ # Performs the reverse function of `unescape_unreserved`. Unlike
85
+ # the previous function, we can reuse the logic in URI.escape.
86
+ def escape_unreserved(input)
87
+ URI.escape(input, UNSAFE)
88
+ end
89
+
52
90
  def sanitize_string(input)
53
91
  if input.is_a? String
54
92
  input = input.dup.force_encoding('UTF-8')
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = "rack-utf8_sanitizer"
5
- gem.version = '1.0.0'
5
+ gem.version = '1.1.0'
6
6
  gem.authors = ["Peter Zotov"]
7
7
  gem.email = ["whitequark@whitequark.org"]
8
8
  gem.description = %{Rack::UTF8Sanitizer is a Rack middleware which cleans up } <<
@@ -31,7 +31,7 @@ describe Rack::UTF8Sanitizer do
31
31
  describe "with invalid UTF-8 input" do
32
32
  before do
33
33
  @plain_input = "foo\xe0".force_encoding('UTF-8')
34
- @uri_input = "foo%E0".force_encoding('UTF-8')
34
+ @uri_input = "http://bar/foo%E0".force_encoding('UTF-8')
35
35
  end
36
36
 
37
37
  behaves_like :does_sanitize_plain
@@ -40,7 +40,7 @@ describe Rack::UTF8Sanitizer do
40
40
 
41
41
  describe "with invalid, incorrectly percent-encoded UTF-8 URI input" do
42
42
  before do
43
- @uri_input = "foo%E0\xe0".force_encoding('UTF-8')
43
+ @uri_input = "http://bar/foo%E0\xe0".force_encoding('UTF-8')
44
44
  end
45
45
 
46
46
  behaves_like :does_sanitize_uri
@@ -49,7 +49,7 @@ describe Rack::UTF8Sanitizer do
49
49
  describe "with invalid ASCII-8BIT input" do
50
50
  before do
51
51
  @plain_input = "foo\xe0"
52
- @uri_input = "foo%E0"
52
+ @uri_input = "http://bar/foo%E0"
53
53
  end
54
54
 
55
55
  behaves_like :does_sanitize_plain
@@ -58,7 +58,7 @@ describe Rack::UTF8Sanitizer do
58
58
 
59
59
  describe "with invalid, incorrectly percent-encoded ASCII-8BIT URI input" do
60
60
  before do
61
- @uri_input = "foo%E0\xe0"
61
+ @uri_input = "http://bar/foo%E0\xe0"
62
62
  end
63
63
 
64
64
  behaves_like :does_sanitize_uri
@@ -89,16 +89,24 @@ describe Rack::UTF8Sanitizer do
89
89
  describe "with valid UTF-8 input" do
90
90
  before do
91
91
  @plain_input = "foo bar лол".force_encoding('UTF-8')
92
- @uri_input = "foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
92
+ @uri_input = "http://bar/foo+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
93
93
  end
94
94
 
95
95
  behaves_like :identity_plain
96
96
  behaves_like :identity_uri
97
+
98
+ describe "with URI characters from reserved range" do
99
+ before do
100
+ @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB".force_encoding('UTF-8')
101
+ end
102
+
103
+ behaves_like :identity_uri
104
+ end
97
105
  end
98
106
 
99
107
  describe "with valid, not percent-encoded UTF-8 URI input" do
100
108
  before do
101
- @uri_input = "foo+bar+лол".force_encoding('UTF-8')
109
+ @uri_input = "http://bar/foo+bar+лол".force_encoding('UTF-8')
102
110
  end
103
111
 
104
112
  it "does not change URI-like entity (REQUEST_PATH)" do
@@ -114,17 +122,25 @@ describe Rack::UTF8Sanitizer do
114
122
  describe "with valid ASCII-8BIT input" do
115
123
  before do
116
124
  @plain_input = "bar baz"
117
- @uri_input = "bar+baz"
125
+ @uri_input = "http://bar/bar+baz"
118
126
  end
119
127
 
120
128
  behaves_like :identity_plain
121
129
  behaves_like :identity_uri
130
+
131
+ describe "with URI characters from reserved range" do
132
+ before do
133
+ @uri_input = "http://bar/foo+%2F%3A+bar+%D0%BB%D0%BE%D0%BB"
134
+ end
135
+
136
+ behaves_like :identity_uri
137
+ end
122
138
  end
123
139
 
124
140
  describe "with frozen strings" do
125
141
  before do
126
142
  @plain_input = "bar baz".freeze
127
- @uri_input = "bar+baz".freeze
143
+ @uri_input = "http://bar/bar+baz".freeze
128
144
  end
129
145
 
130
146
  it "preserves the frozen? status of input" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rack-utf8_sanitizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-05 00:00:00.000000000 Z
12
+ date: 2013-03-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rack