utf8_utils 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +20 -8
- data/lib/utf8_utils.rb +44 -43
- data/lib/utf8_utils/version.rb +1 -1
- data/test/utf8_utils_test.rb +54 -7
- metadata +4 -15
data/README.md
CHANGED
@@ -11,7 +11,13 @@ access at [its home on Github](github.com/norman/utf8_utils).
|
|
11
11
|
|
12
12
|
## The Problem
|
13
13
|
|
14
|
-
|
14
|
+
Your application may have to deal with invalid UTF-8 strings that come from
|
15
|
+
user input that is copied and pasted from Microsoft Word, and includes
|
16
|
+
Windows-encoded "smart quotes," or other characters. This is only one scenario;
|
17
|
+
there are many ways your application could receive such input.
|
18
|
+
|
19
|
+
Here's what happens when you try to access a string with invalid UTF-8
|
20
|
+
characters in Ruby 1.9:
|
15
21
|
|
16
22
|
ruby-1.9.1-p378 > "my messed up \x92 string".split(//u)
|
17
23
|
ArgumentError: invalid byte sequence in UTF-8
|
@@ -19,24 +25,30 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
19
25
|
from (irb):3
|
20
26
|
from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
|
21
27
|
|
28
|
+
Ruby is quite particular about this - accessing the data in the string is
|
29
|
+
difficult as almost all string access methods will die with this error.
|
30
|
+
|
22
31
|
## The Solution
|
23
32
|
|
33
|
+
This library breaks the string down into an array of raw bytes, and cleans up
|
34
|
+
the ones that are impossible UTF-8 sequences.
|
35
|
+
|
24
36
|
ruby-1.9.1-p378 > "my messed up \x92 string".tidy_bytes.split(//u)
|
25
37
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
26
38
|
|
27
|
-
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
28
|
-
characters,
|
29
|
-
a bad assumption, but may not always work.
|
39
|
+
Note that, like ActiveSupport, it naively assumes that if you have invalid UTF-8
|
40
|
+
characters, their encoding is either Windows CP1252 or ISO-8859-1. In practice
|
41
|
+
this isn't a bad assumption, but may not always work.
|
30
42
|
|
31
43
|
This library's `tidy_bytes` method is a little less than twice as fast as the
|
32
44
|
one provided by ActiveSupport:
|
33
45
|
|
46
|
+
|
34
47
|
| ACTIVE_SUPPORT | UTF8_UTILS |
|
35
48
|
----------------------------------------------------------
|
36
|
-
tidy bytes x20000 | 1.
|
49
|
+
tidy bytes x20000 | 1.004 | 0.607 |
|
37
50
|
==========================================================
|
38
|
-
Total | 1.
|
39
|
-
|
51
|
+
Total | 1.004 | 0.607 |
|
40
52
|
|
41
53
|
## Getting it
|
42
54
|
|
@@ -62,4 +74,4 @@ one provided by ActiveSupport:
|
|
62
74
|
|
63
75
|
Created by Norman Clarke.
|
64
76
|
|
65
|
-
Copyright (c) 2010, released under the MIT license.
|
77
|
+
Copyright (c) 2010, released under the MIT license.
|
data/lib/utf8_utils.rb
CHANGED
@@ -45,49 +45,55 @@ module UTF8Utils
|
|
45
45
|
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
46
46
|
# CP1252 or ISO-8859-1. In practice this isn't a bad assumption, but may not
|
47
47
|
# always work.
|
48
|
-
|
48
|
+
#
|
49
|
+
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
50
|
+
# encoding is CP1252 or ISO-8859-1.
|
51
|
+
def tidy_bytes(force = false)
|
49
52
|
|
50
|
-
|
51
|
-
|
53
|
+
if force
|
54
|
+
return unpack("C*").map do |b|
|
55
|
+
tidy_byte(b)
|
56
|
+
end.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
57
|
+
end
|
52
58
|
|
53
|
-
bytes
|
59
|
+
bytes = unpack("C*")
|
60
|
+
conts_expected = 0
|
61
|
+
last_lead = 0
|
54
62
|
|
55
|
-
|
63
|
+
bytes.each_index do |i|
|
56
64
|
|
57
|
-
|
58
|
-
|
59
|
-
|
65
|
+
byte = bytes[i]
|
66
|
+
is_ascii = byte < 128
|
67
|
+
is_cont = byte > 127 && byte < 192
|
68
|
+
is_lead = byte > 191 && byte < 245
|
69
|
+
is_unused = byte > 240
|
70
|
+
is_restricted = byte > 244
|
60
71
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
continuation_bytes_expected = 0
|
74
|
-
end
|
75
|
-
elsif leading_byte
|
76
|
-
if continuation_bytes_expected > 0
|
77
|
-
# Expected continuation, got leading, so clean previous
|
78
|
-
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
79
|
-
continuation_bytes_expected = 0
|
72
|
+
# Impossible or highly unlikely byte? Clean it.
|
73
|
+
if is_unused || is_restricted
|
74
|
+
bytes[i] = tidy_byte(byte)
|
75
|
+
elsif is_cont
|
76
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
77
|
+
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
78
|
+
else
|
79
|
+
if conts_expected > 0
|
80
|
+
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
81
|
+
# the leading byte.
|
82
|
+
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
83
|
+
conts_expected = 0
|
80
84
|
end
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
+
if is_lead
|
86
|
+
# Final byte is leading? Clean it.
|
87
|
+
if i == bytes.length - 1
|
88
|
+
bytes[i] = tidy_byte(bytes.last)
|
89
|
+
else
|
90
|
+
# Valid leading byte? Expect continuations determined by position of
|
91
|
+
# first zero bit, with max of 3.
|
92
|
+
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
93
|
+
last_lead = i
|
94
|
+
end
|
85
95
|
end
|
86
96
|
end
|
87
|
-
# Don't allow the string to terminate with a leading byte
|
88
|
-
if leading_byte && index == bytes.length - 1
|
89
|
-
bytes[index] = tidy_byte(bytes.last)
|
90
|
-
end
|
91
97
|
end
|
92
98
|
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
93
99
|
end
|
@@ -100,17 +106,12 @@ module UTF8Utils
|
|
100
106
|
private
|
101
107
|
|
102
108
|
def tidy_byte(byte)
|
103
|
-
|
104
|
-
UTF8Utils::CP1252[byte]
|
105
|
-
elsif byte < 192
|
106
|
-
[194, byte]
|
107
|
-
else
|
108
|
-
[195, byte - 64]
|
109
|
-
end
|
109
|
+
byte < 160 ? UTF8Utils::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
|
110
110
|
end
|
111
|
+
|
111
112
|
end
|
112
113
|
end
|
113
114
|
|
114
115
|
class String
|
115
116
|
include UTF8Utils::StringExt
|
116
|
-
end
|
117
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
@@ -1,19 +1,66 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
+
require "rubygems"
|
3
|
+
require "active_support"
|
2
4
|
require "test/unit"
|
3
5
|
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
4
6
|
|
5
7
|
class UTF8UtilsTest < Test::Unit::TestCase
|
6
8
|
|
7
|
-
|
8
|
-
"
|
9
|
-
"\
|
10
|
-
"\
|
9
|
+
SINGLE_BYTE_CASES = {
|
10
|
+
"\x21" => "!", # Valid ASCII byte, low
|
11
|
+
"\x41" => "A", # Valid ASCII byte, mid
|
12
|
+
"\x7E" => "~", # Valid ASCII byte, high
|
13
|
+
"\x80" => "€", # Continuation byte, low (CP1252)
|
14
|
+
"\x94" => "”", # Continuation byte, mid (CP1252)
|
15
|
+
"\x9F" => "Ÿ", # Continuation byte, high (CP1252)
|
16
|
+
"\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
|
17
|
+
"\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
|
18
|
+
"\xC2" => "Â", # Start of 2-byte sequence, low
|
19
|
+
"\xC8" => "È", # Start of 2-byte sequence, mid
|
20
|
+
"\xDF" => "ß", # Start of 2-byte sequence, high
|
21
|
+
"\xE0" => "à", # Start of 3-byte sequence, low
|
22
|
+
"\xE8" => "è", # Start of 3-byte sequence, mid
|
23
|
+
"\xEF" => "ï", # Start of 3-byte sequence, high
|
24
|
+
"\xF0" => "ð", # Start of 4-byte sequence
|
25
|
+
"\xF1" => "ñ", # Unused byte
|
26
|
+
"\xFF" => "ÿ", # Restricted byte
|
11
27
|
}
|
28
|
+
|
29
|
+
def setup
|
30
|
+
# SINGLE_BYTE_CASES.each do |k, v|
|
31
|
+
# SINGLE_BYTE_CASES[k] = ActiveSupport::Multibyte::Chars.new(k)
|
32
|
+
# end
|
33
|
+
end
|
12
34
|
|
13
|
-
def
|
14
|
-
|
15
|
-
assert_equal good, bad.tidy_bytes
|
35
|
+
def test_should_handle_single_byte_cases
|
36
|
+
SINGLE_BYTE_CASES.each do |bad, good|
|
37
|
+
assert_equal good, bad.tidy_bytes.to_s
|
38
|
+
assert_equal "#{good}#{good}", "#{bad}#{bad}".tidy_bytes
|
39
|
+
assert_equal "#{good}#{good}#{good}", "#{bad}#{bad}#{bad}".tidy_bytes
|
40
|
+
assert_equal "#{good}a", "#{bad}a".tidy_bytes
|
41
|
+
assert_equal "a#{good}a", "a#{bad}a".tidy_bytes
|
42
|
+
assert_equal "a#{good}", "a#{bad}".tidy_bytes
|
16
43
|
end
|
17
44
|
end
|
45
|
+
|
46
|
+
def test_should_tidy_leading_byte_followed_by_too_few_continuation_bytes
|
47
|
+
string = "\xF0\xA5\xA4\x21"
|
48
|
+
assert_equal "ð¥¤!", string.tidy_bytes
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_should_not_modifiy_valid_utf8_unless_forced
|
52
|
+
# Nothing can be done to tidy the bytes here, because it's valid UTF-8.
|
53
|
+
assert_not_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes
|
54
|
+
assert_not_equal "»", "\xC2\xBB".tidy_bytes
|
55
|
+
assert_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes(true)
|
56
|
+
assert_equal "»", "\xC2\xBB".tidy_bytes(true)
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_should_not_tidy_leading_byte_followed_by_too_many_continuation_bytes_unless_forced
|
60
|
+
string = "\xF0\xA5\xA4\xA4\xA4"
|
61
|
+
assert_not_equal "𥤤¤", string.tidy_bytes
|
62
|
+
assert_equal "𥤤¤", string.tidy_bytes(true)
|
63
|
+
end
|
64
|
+
|
18
65
|
|
19
66
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 2
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 2.0.
|
8
|
+
- 1
|
9
|
+
version: 2.0.1
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Norman Clarke
|
@@ -16,19 +16,8 @@ cert_chain: []
|
|
16
16
|
|
17
17
|
date: 2010-04-08 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
|
-
dependencies:
|
20
|
-
|
21
|
-
name: mocha
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
requirements:
|
25
|
-
- - ">="
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
segments:
|
28
|
-
- 0
|
29
|
-
version: "0"
|
30
|
-
type: :development
|
31
|
-
version_requirements: *id001
|
19
|
+
dependencies: []
|
20
|
+
|
32
21
|
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
33
22
|
email: norman@njclarke.com
|
34
23
|
executables: []
|