utf8-cleaner 0.0.4 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +4 -0
- data/Guardfile +9 -0
- data/README.md +2 -0
- data/Rakefile +4 -0
- data/lib/utf8-cleaner.rb +1 -0
- data/lib/utf8-cleaner/middleware.rb +13 -32
- data/lib/utf8-cleaner/uri_string.rb +112 -0
- data/lib/utf8-cleaner/version.rb +1 -1
- data/spec/middleware_spec.rb +9 -16
- data/spec/spec_helper.rb +3 -0
- data/spec/uri_string_spec.rb +45 -0
- data/utf8-cleaner.gemspec +3 -0
- metadata +56 -13
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b6e265c0aefe9f9c6fa6082b86f21c81a92400b6
|
4
|
+
data.tar.gz: 12dd12dceef125122d2877dbc9e9e597055ddfac
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b794d47e0c9460ef5ec9eadc9340f705a491b0c48370efb82310d2bf21ff7713d655a4fdbb312c4788913879476be5a4c8823e8f494d8a2fe7376ecc4dd58a6c
|
7
|
+
data.tar.gz: 0bf1fb1a6d23600a3f5ff4ae7d1f62c7fded5a0d7d0233ebc85103b12c51a849a7ef83708a21568a28bd005e8ea1199ffff6b0d05b0c186a8889624e190f5e19
|
data/.travis.yml
ADDED
data/Guardfile
ADDED
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# UTF8Cleaner
|
2
2
|
|
3
|
+
[<img src="https://secure.travis-ci.org/singlebrook/utf8-cleaner.png" />](http://travis-ci.org/singlebrook/utf8-cleaner)
|
4
|
+
|
3
5
|
Removes invalid UTF-8 characters from the environment so that your app doesn't choke
|
4
6
|
on them. This prevents errors like "invalid byte sequence in UTF-8".
|
5
7
|
|
data/Rakefile
CHANGED
data/lib/utf8-cleaner.rb
CHANGED
@@ -7,6 +7,7 @@ module UTF8Cleaner
|
|
7
7
|
"QUERY_STRING",
|
8
8
|
"REQUEST_PATH",
|
9
9
|
"REQUEST_URI",
|
10
|
+
"HTTP_COOKIE"
|
10
11
|
]
|
11
12
|
|
12
13
|
def initialize(app)
|
@@ -19,42 +20,22 @@ module UTF8Cleaner
|
|
19
20
|
|
20
21
|
private
|
21
22
|
|
22
|
-
def
|
23
|
-
|
24
|
-
|
25
|
-
value = sanitize_string(URI.decode(value))
|
26
|
-
env[key] = URI.encode(value)
|
27
|
-
end
|
28
|
-
["HTTP_COOKIE"].each do |key|
|
29
|
-
next unless value = env[key]
|
30
|
-
fixed = sanitize_string(value)
|
31
|
-
env[key] = fixed if fixed
|
32
|
-
end
|
33
|
-
env
|
34
|
-
end
|
35
|
-
|
36
|
-
def sanitize_string(string)
|
37
|
-
return string unless string.is_a? String
|
38
|
-
|
39
|
-
# Try it as UTF-8 directly
|
40
|
-
cleaned = string.dup.force_encoding('UTF-8')
|
41
|
-
if cleaned.valid_encoding?
|
42
|
-
cleaned
|
43
|
-
else
|
44
|
-
utf8clean(string)
|
45
|
-
end
|
23
|
+
def is_valid_utf8(string)
|
24
|
+
utf8 = string.dup.force_encoding('UTF-8')
|
25
|
+
string == utf8 && utf8.valid_encoding?
|
46
26
|
rescue EncodingError
|
47
|
-
|
27
|
+
false
|
48
28
|
end
|
49
29
|
|
50
|
-
def
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
30
|
+
def sanitize_env(env)
|
31
|
+
SANITIZE_ENV_KEYS.each do |key|
|
32
|
+
next unless value = env[key]
|
33
|
+
|
34
|
+
if value.include?('%')
|
35
|
+
env[key] = URIString.new(value).cleaned
|
36
|
+
end
|
57
37
|
end
|
38
|
+
env
|
58
39
|
end
|
59
40
|
end
|
60
41
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module UTF8Cleaner
  # Wraps a URI-encoded (percent-escaped) string and removes the escape
  # sequences that do not decode to valid UTF-8, leaving every other
  # character exactly as it was.
  class URIString
    # Characters accepted as the two digits following a '%'.
    HEX_DIGITS = '0123456789abcdefABCDEF'.freeze

    attr_accessor :data

    # data - the raw, still percent-encoded String (e.g. a PATH_INFO value).
    def initialize(data)
      self.data = data
    end

    # Returns data unchanged when it already decodes to valid UTF-8,
    # otherwise a copy with the invalid escape sequences removed.
    def cleaned
      if valid?
        data
      else
        encoded_char_array.join
      end
    end

    # True when data contains at least one '%' character.
    def encoded?
      data.include?('%')
    end

    # True when the percent-decoded form of data is valid UTF-8.
    def valid?
      valid_uri_encoded_utf8(data)
    end

    private

    # Returns an array of literal characters and valid URI-encoded UTF-8
    # characters (e.g. ['f', 'o', 'o', '%E2%9C%93']).
    def encoded_char_array
      char_array = []
      index = 0

      while index < data.length
        # A '%' that is not followed by two hex digits (including one that
        # sits at the very end of the string) is not an escape sequence, so
        # it is kept as a literal character instead of consuming its
        # neighbours or reading past the end of the string.
        unless escape_at?(index)
          char_array << data[index]
          index += 1
          next
        end

        # How long is the character introduced by this escaped byte?
        bytes = utf8_char_length_in_bytes("0x#{data[index + 1, 2]}")

        # Grab that many escaped bytes; an empty result means the string
        # ended early or an expected '%' was missing.
        utf8_char_encoded = next_n_bytes_from(index, bytes).join

        if !utf8_char_encoded.empty? && valid_uri_encoded_utf8(utf8_char_encoded)
          # A complete, valid character: keep it and step over all of its bytes.
          char_array << utf8_char_encoded
          index += bytes * 3
        else
          # Invalid: drop only this escaped byte and re-examine what follows.
          index += 3
        end
      end

      char_array
    end

    # True when the three characters starting at index form '%XX' with
    # hexadecimal X.
    def escape_at?(index)
      data[index] == '%' && hex_digit?(data[index + 1]) && hex_digit?(data[index + 2])
    end

    # True when char is a single hexadecimal digit; nil (past the end of the
    # string) is not.
    def hex_digit?(char)
      !char.nil? && HEX_DIGITS.include?(char)
    end

    # True when string, with its %XX escapes replaced by the bytes they
    # stand for, is valid UTF-8. The decoding is done on a binary copy so
    # that arbitrary byte values can be substituted without encoding errors.
    def valid_uri_encoded_utf8(string)
      decoded = string.dup.force_encoding('BINARY').gsub(/%\h\h/) do |escape|
        escape[1, 2].hex.chr
      end
      decoded.force_encoding('UTF-8').valid_encoding?
    end

    # Grab the next num_bytes URI-encoded bytes from the raw character array.
    # Returns an array like ['%E2', '%9C', '%93'], or [] when the string is
    # too short or an expected percent sign is missing.
    def next_n_bytes_from(index, num_bytes)
      return [] if data.length < index + (3 * num_bytes)

      escapes = Array.new(num_bytes) { |n| data[index + (3 * n), 3] }

      # An expected percent sign was missing: the whole character is invalid.
      escapes.all? { |escape| escape[0] == '%' } ? escapes : []
    end

    # If the first byte is between 0xC0 and 0xDF, the UTF-8 character has two bytes;
    # if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes;
    # and if it is between 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
    # Anything below 0xC0 is treated as a single byte.
    # first_byte is a string like "0x13"
    def utf8_char_length_in_bytes(first_byte)
      value = first_byte.hex

      if value < 0xC0
        1
      elsif value < 0xE0
        2
      elsif value < 0xF0
        3
      else
        4
      end
    end
  end
end
|
data/lib/utf8-cleaner/version.rb
CHANGED
data/spec/middleware_spec.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require 'spec_helper'
|
3
2
|
|
4
|
-
describe
|
3
|
+
describe UTF8Cleaner::Middleware do
|
5
4
|
let :env do
|
6
5
|
{
|
7
|
-
'PATH_INFO' => 'foo
|
6
|
+
'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
|
8
7
|
'QUERY_STRING' => 'foo=bar%FF',
|
9
8
|
'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
|
10
|
-
'REQUEST_URI' => '%C3%89'
|
9
|
+
'REQUEST_URI' => '%C3%89%E2%9C%93'
|
11
10
|
}
|
12
11
|
end
|
13
12
|
|
@@ -15,19 +14,13 @@ describe 'UTF8Cleaner::Middleware' do
|
|
15
14
|
UTF8Cleaner::Middleware.new(nil).send(:sanitize_env, env)
|
16
15
|
end
|
17
16
|
|
18
|
-
|
19
|
-
new_env['QUERY_STRING'].should == 'foo=bar'
|
17
|
+
describe "removes invalid UTF-8 sequences" do
|
18
|
+
it { new_env['QUERY_STRING'].should == 'foo=bar' }
|
19
|
+
it { new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++' }
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
new_env['PATH_INFO'].should == 'foo/bar
|
24
|
-
|
25
|
-
|
26
|
-
it "leaves valid %-escaped UTF-8 chars alone" do
|
27
|
-
new_env['REQUEST_URI'].should == '%C3%89'
|
28
|
-
end
|
29
|
-
|
30
|
-
it "handles an awful URL" do
|
31
|
-
new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++'
|
22
|
+
describe "leaves all valid characters untouched" do
|
23
|
+
it { new_env['PATH_INFO'].should == 'foo/bar%2e%2fbaz%26%3B' }
|
24
|
+
it { new_env['REQUEST_URI'].should == '%C3%89%E2%9C%93' }
|
32
25
|
end
|
33
26
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -2,8 +2,11 @@ require 'rubygems'
|
|
2
2
|
require 'rspec/autorun'
|
3
3
|
|
4
4
|
require 'utf8-cleaner'
|
5
|
+
require 'uri'
|
5
6
|
|
6
7
|
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
|
7
10
|
# Run specs in random order to surface order dependencies. If you find an
|
8
11
|
# order dependency and want to debug it, you can fix the order by providing
|
9
12
|
# the seed, which is printed after each run.
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module UTF8Cleaner

  # Covers URIString's public API: construction, #cleaned, #encoded? and
  # #valid?, using one fixture per kind of input.
  describe URIString do
    let :invalid_string do
      URIString.new('%FF')
    end

    let :ascii_string do
      URIString.new('foo')
    end

    let :encoded_string do
      URIString.new('%26')
    end

    let :multibyte_string do
      URIString.new('%E2%9C%93')
    end

    # Decoded, segment by segment:  foo/  bar.  /  baz&  ;  √  baz
    let :complex_invalid_string do
      URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz')
    end

    describe '#new' do
      specify { encoded_string.should be_a URIString }
    end

    describe '#cleaned' do
      specify { invalid_string.cleaned.should eq('') }
      specify { ascii_string.cleaned.should eq('foo') }
      specify { encoded_string.cleaned.should eq('%26') }
      specify { multibyte_string.cleaned.should eq('%E2%9C%93') }
      specify { complex_invalid_string.cleaned.should eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
    end

    describe '#encoded?' do
      # Anything containing a '%' counts as encoded, valid or not.
      specify { encoded_string.should be_encoded }
      specify { invalid_string.should be_encoded }
      specify { multibyte_string.should be_encoded }
      specify { complex_invalid_string.should be_encoded }

      specify { ascii_string.should_not be_encoded }
    end

    describe '#valid?' do
      specify { ascii_string.should be_valid }
      specify { encoded_string.should be_valid }
      specify { multibyte_string.should be_valid }

      specify { invalid_string.should_not be_valid }
      specify { complex_invalid_string.should_not be_valid }
    end

  end

end
|
data/utf8-cleaner.gemspec
CHANGED
@@ -17,5 +17,8 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
+
gem.add_development_dependency "rake"
|
21
|
+
gem.add_development_dependency "guard"
|
22
|
+
gem.add_development_dependency "guard-rspec"
|
20
23
|
gem.add_development_dependency "rspec"
|
21
24
|
end
|
metadata
CHANGED
@@ -1,30 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8-cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.6
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Leon Miller-Out
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-10-16 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: guard
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: guard-rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
14
55
|
- !ruby/object:Gem::Dependency
|
15
56
|
name: rspec
|
16
57
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
58
|
requirements:
|
19
|
-
- -
|
59
|
+
- - '>='
|
20
60
|
- !ruby/object:Gem::Version
|
21
61
|
version: '0'
|
22
62
|
type: :development
|
23
63
|
prerelease: false
|
24
64
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
65
|
requirements:
|
27
|
-
- -
|
66
|
+
- - '>='
|
28
67
|
- !ruby/object:Gem::Version
|
29
68
|
version: '0'
|
30
69
|
description: Removes invalid UTF8 characters from the URL and other env vars
|
@@ -35,41 +74,45 @@ extensions: []
|
|
35
74
|
extra_rdoc_files: []
|
36
75
|
files:
|
37
76
|
- .gitignore
|
77
|
+
- .travis.yml
|
38
78
|
- Gemfile
|
79
|
+
- Guardfile
|
39
80
|
- LICENSE.txt
|
40
81
|
- README.md
|
41
82
|
- Rakefile
|
42
83
|
- lib/utf8-cleaner.rb
|
43
84
|
- lib/utf8-cleaner/middleware.rb
|
44
85
|
- lib/utf8-cleaner/railtie.rb
|
86
|
+
- lib/utf8-cleaner/uri_string.rb
|
45
87
|
- lib/utf8-cleaner/version.rb
|
46
88
|
- spec/middleware_spec.rb
|
47
89
|
- spec/spec_helper.rb
|
90
|
+
- spec/uri_string_spec.rb
|
48
91
|
- utf8-cleaner.gemspec
|
49
92
|
homepage: https://github.com/singlebrook/utf8-cleaner
|
50
93
|
licenses: []
|
94
|
+
metadata: {}
|
51
95
|
post_install_message:
|
52
96
|
rdoc_options: []
|
53
97
|
require_paths:
|
54
98
|
- lib
|
55
99
|
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
-
none: false
|
57
100
|
requirements:
|
58
|
-
- -
|
101
|
+
- - '>='
|
59
102
|
- !ruby/object:Gem::Version
|
60
103
|
version: '0'
|
61
104
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
-
none: false
|
63
105
|
requirements:
|
64
|
-
- -
|
106
|
+
- - '>='
|
65
107
|
- !ruby/object:Gem::Version
|
66
108
|
version: '0'
|
67
109
|
requirements: []
|
68
110
|
rubyforge_project:
|
69
|
-
rubygems_version:
|
111
|
+
rubygems_version: 2.0.3
|
70
112
|
signing_key:
|
71
|
-
specification_version:
|
113
|
+
specification_version: 4
|
72
114
|
summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
|
73
115
|
test_files:
|
74
116
|
- spec/middleware_spec.rb
|
75
117
|
- spec/spec_helper.rb
|
118
|
+
- spec/uri_string_spec.rb
|