utf8_utils 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +20 -8
- data/lib/utf8_utils.rb +44 -43
- data/lib/utf8_utils/version.rb +1 -1
- data/test/utf8_utils_test.rb +54 -7
- metadata +4 -15
data/README.md
CHANGED
|
@@ -11,7 +11,13 @@ access at [its home on Github](github.com/norman/utf8_utils).
|
|
|
11
11
|
|
|
12
12
|
## The Problem
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
Your application may have to deal with invalid UTF-8 strings that come from
|
|
15
|
+
user input that is copied and pasted from Microsoft Word, and includes
|
|
16
|
+
Windows-encoded "smart quotes," or other characters. This is only one scenario;
|
|
17
|
+
there are many ways your application could receive such input.
|
|
18
|
+
|
|
19
|
+
Here's what happens when you try to access a string with invalid UTF-8
|
|
20
|
+
characters in Ruby 1.9:
|
|
15
21
|
|
|
16
22
|
ruby-1.9.1-p378 > "my messed up \x92 string".split(//u)
|
|
17
23
|
ArgumentError: invalid byte sequence in UTF-8
|
|
@@ -19,24 +25,30 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
|
19
25
|
from (irb):3
|
|
20
26
|
from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
|
|
21
27
|
|
|
28
|
+
Ruby is quite particular about this - accessing the data in the string is
|
|
29
|
+
difficult as almost all string access methods will die with this error.
|
|
30
|
+
|
|
22
31
|
## The Solution
|
|
23
32
|
|
|
33
|
+
This library breaks the string down into an array of raw bytes, and cleans up
|
|
34
|
+
the ones that are impossible UTF-8 sequences.
|
|
35
|
+
|
|
24
36
|
ruby-1.9.1-p378 > "my messed up \x92 string".tidy_bytes.split(//u)
|
|
25
37
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
|
26
38
|
|
|
27
|
-
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
|
28
|
-
characters,
|
|
29
|
-
a bad assumption, but may not always work.
|
|
39
|
+
Note that, like ActiveSupport, it naively assumes if you have invalid UTF8
|
|
40
|
+
characters, their encoding is either Windows CP1252 or ISO-8859-1. In practice
|
|
41
|
+
this isn't a bad assumption, but may not always work.
|
|
30
42
|
|
|
31
43
|
This library's `tidy_bytes` method is a little less than twice as fast as the
|
|
32
44
|
one provided by ActiveSupport:
|
|
33
45
|
|
|
46
|
+
|
|
34
47
|
| ACTIVE_SUPPORT | UTF8_UTILS |
|
|
35
48
|
----------------------------------------------------------
|
|
36
|
-
tidy bytes x20000 | 1.
|
|
49
|
+
tidy bytes x20000 | 1.004 | 0.607 |
|
|
37
50
|
==========================================================
|
|
38
|
-
Total | 1.
|
|
39
|
-
|
|
51
|
+
Total | 1.004 | 0.607 |
|
|
40
52
|
|
|
41
53
|
## Getting it
|
|
42
54
|
|
|
@@ -62,4 +74,4 @@ one provided by ActiveSupport:
|
|
|
62
74
|
|
|
63
75
|
Created by Norman Clarke.
|
|
64
76
|
|
|
65
|
-
Copyright (c) 2010, released under the MIT license.
|
|
77
|
+
Copyright (c) 2010, released under the MIT license.
|
data/lib/utf8_utils.rb
CHANGED
|
@@ -45,49 +45,55 @@ module UTF8Utils
|
|
|
45
45
|
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
|
46
46
|
# CP1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
|
47
47
|
# always work.
|
|
48
|
-
|
|
48
|
+
#
|
|
49
|
+
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
|
|
50
|
+
# encoding is CP1252 or ISO-8859-1.
|
|
51
|
+
def tidy_bytes(force = false)
|
|
49
52
|
|
|
50
|
-
|
|
51
|
-
|
|
53
|
+
if force
|
|
54
|
+
return unpack("C*").map do |b|
|
|
55
|
+
tidy_byte(b)
|
|
56
|
+
end.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
|
57
|
+
end
|
|
52
58
|
|
|
53
|
-
bytes
|
|
59
|
+
bytes = unpack("C*")
|
|
60
|
+
conts_expected = 0
|
|
61
|
+
last_lead = 0
|
|
54
62
|
|
|
55
|
-
|
|
63
|
+
bytes.each_index do |i|
|
|
56
64
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
byte = bytes[i]
|
|
66
|
+
is_ascii = byte < 128
|
|
67
|
+
is_cont = byte > 127 && byte < 192
|
|
68
|
+
is_lead = byte > 191 && byte < 245
|
|
69
|
+
is_unused = byte > 240
|
|
70
|
+
is_restricted = byte > 244
|
|
60
71
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
continuation_bytes_expected = 0
|
|
74
|
-
end
|
|
75
|
-
elsif leading_byte
|
|
76
|
-
if continuation_bytes_expected > 0
|
|
77
|
-
# Expected continuation, got leading, so clean previous
|
|
78
|
-
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
|
79
|
-
continuation_bytes_expected = 0
|
|
72
|
+
# Impossible or highly unlikely byte? Clean it.
|
|
73
|
+
if is_unused || is_restricted
|
|
74
|
+
bytes[i] = tidy_byte(byte)
|
|
75
|
+
elsif is_cont
|
|
76
|
+
# Not expecting continuation byte? Clean up. Otherwise, now expect one less.
|
|
77
|
+
conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
|
|
78
|
+
else
|
|
79
|
+
if conts_expected > 0
|
|
80
|
+
# Expected continuation, but got ASCII or leading? Clean backwards up to
|
|
81
|
+
# the leading byte.
|
|
82
|
+
(1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
|
|
83
|
+
conts_expected = 0
|
|
80
84
|
end
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
+
if is_lead
|
|
86
|
+
# Final byte is leading? Clean it.
|
|
87
|
+
if i == bytes.length - 1
|
|
88
|
+
bytes[i] = tidy_byte(bytes.last)
|
|
89
|
+
else
|
|
90
|
+
# Valid leading byte? Expect continuations determined by position of
|
|
91
|
+
# first zero bit, with max of 3.
|
|
92
|
+
conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
|
|
93
|
+
last_lead = i
|
|
94
|
+
end
|
|
85
95
|
end
|
|
86
96
|
end
|
|
87
|
-
# Don't allow the string to terminate with a leading byte
|
|
88
|
-
if leading_byte && index == bytes.length - 1
|
|
89
|
-
bytes[index] = tidy_byte(bytes.last)
|
|
90
|
-
end
|
|
91
97
|
end
|
|
92
98
|
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
|
93
99
|
end
|
|
@@ -100,17 +106,12 @@ module UTF8Utils
|
|
|
100
106
|
private
|
|
101
107
|
|
|
102
108
|
def tidy_byte(byte)
|
|
103
|
-
|
|
104
|
-
UTF8Utils::CP1252[byte]
|
|
105
|
-
elsif byte < 192
|
|
106
|
-
[194, byte]
|
|
107
|
-
else
|
|
108
|
-
[195, byte - 64]
|
|
109
|
-
end
|
|
109
|
+
byte < 160 ? UTF8Utils::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
|
|
110
110
|
end
|
|
111
|
+
|
|
111
112
|
end
|
|
112
113
|
end
|
|
113
114
|
|
|
114
115
|
class String
|
|
115
116
|
include UTF8Utils::StringExt
|
|
116
|
-
end
|
|
117
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
|
@@ -1,19 +1,66 @@
|
|
|
1
1
|
# encoding: utf-8
|
|
2
|
+
require "rubygems"
|
|
3
|
+
require "active_support"
|
|
2
4
|
require "test/unit"
|
|
3
5
|
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
|
4
6
|
|
|
5
7
|
class UTF8UtilsTest < Test::Unit::TestCase
|
|
6
8
|
|
|
7
|
-
|
|
8
|
-
"
|
|
9
|
-
"\
|
|
10
|
-
"\
|
|
9
|
+
SINGLE_BYTE_CASES = {
|
|
10
|
+
"\x21" => "!", # Valid ASCII byte, low
|
|
11
|
+
"\x41" => "A", # Valid ASCII byte, mid
|
|
12
|
+
"\x7E" => "~", # Valid ASCII byte, high
|
|
13
|
+
"\x80" => "€", # Continuation byte, low (cp125)
|
|
14
|
+
"\x94" => "”", # Continuation byte, mid (cp125)
|
|
15
|
+
"\x9F" => "Ÿ", # Continuation byte, high (cp125)
|
|
16
|
+
"\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
|
|
17
|
+
"\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
|
|
18
|
+
"\xC2" => "Â", # Start of 2-byte sequence, low
|
|
19
|
+
"\xC8" => "È", # Start of 2-byte sequence, mid
|
|
20
|
+
"\xDF" => "ß", # Start of 2-byte sequence, high
|
|
21
|
+
"\xE0" => "à", # Start of 3-byte sequence, low
|
|
22
|
+
"\xE8" => "è", # Start of 3-byte sequence, mid
|
|
23
|
+
"\xEF" => "ï", # Start of 3-byte sequence, high
|
|
24
|
+
"\xF0" => "ð", # Start of 4-byte sequence
|
|
25
|
+
"\xF1" => "ñ", # Unused byte
|
|
26
|
+
"\xFF" => "ÿ", # Restricted byte
|
|
11
27
|
}
|
|
28
|
+
|
|
29
|
+
def setup
|
|
30
|
+
# SINGLE_BYTE_CASES.each do |k, v|
|
|
31
|
+
# SINGLE_BYTE_CASES[k] = ActiveSupport::Multibyte::Chars.new(k)
|
|
32
|
+
# end
|
|
33
|
+
end
|
|
12
34
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
assert_equal good, bad.tidy_bytes
|
|
35
|
+
def test_should_handle_single_byte_cases
|
|
36
|
+
SINGLE_BYTE_CASES.each do |bad, good|
|
|
37
|
+
assert_equal good, bad.tidy_bytes.to_s
|
|
38
|
+
assert_equal "#{good}#{good}", "#{bad}#{bad}".tidy_bytes
|
|
39
|
+
assert_equal "#{good}#{good}#{good}", "#{bad}#{bad}#{bad}".tidy_bytes
|
|
40
|
+
assert_equal "#{good}a", "#{bad}a".tidy_bytes
|
|
41
|
+
assert_equal "a#{good}a", "a#{bad}a".tidy_bytes
|
|
42
|
+
assert_equal "a#{good}", "a#{bad}".tidy_bytes
|
|
16
43
|
end
|
|
17
44
|
end
|
|
45
|
+
|
|
46
|
+
def test_should_tidy_leading_byte_followed_by_too_few_continuation_bytes
|
|
47
|
+
string = "\xF0\xA5\xA4\x21"
|
|
48
|
+
assert_equal "ð¥¤!", string.tidy_bytes
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def test_should_not_modifiy_valid_utf8_unless_forced
|
|
52
|
+
# Nothing can be done to tidy the bytes here, because it's valid UTF-8.
|
|
53
|
+
assert_not_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes
|
|
54
|
+
assert_not_equal "»", "\xC2\xBB".tidy_bytes
|
|
55
|
+
assert_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes(true)
|
|
56
|
+
assert_equal "»", "\xC2\xBB".tidy_bytes(true)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def test_should_not_tidy_leading_byte_followed_by_too_many_continuation_bytes_unless_forced
|
|
60
|
+
string = "\xF0\xA5\xA4\xA4\xA4"
|
|
61
|
+
assert_not_equal "𥤤¤", string.tidy_bytes
|
|
62
|
+
assert_equal "𥤤¤", string.tidy_bytes(true)
|
|
63
|
+
end
|
|
64
|
+
|
|
18
65
|
|
|
19
66
|
end
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 2
|
|
7
7
|
- 0
|
|
8
|
-
-
|
|
9
|
-
version: 2.0.
|
|
8
|
+
- 1
|
|
9
|
+
version: 2.0.1
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Norman Clarke
|
|
@@ -16,19 +16,8 @@ cert_chain: []
|
|
|
16
16
|
|
|
17
17
|
date: 2010-04-08 00:00:00 -03:00
|
|
18
18
|
default_executable:
|
|
19
|
-
dependencies:
|
|
20
|
-
|
|
21
|
-
name: mocha
|
|
22
|
-
prerelease: false
|
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
-
requirements:
|
|
25
|
-
- - ">="
|
|
26
|
-
- !ruby/object:Gem::Version
|
|
27
|
-
segments:
|
|
28
|
-
- 0
|
|
29
|
-
version: "0"
|
|
30
|
-
type: :development
|
|
31
|
-
version_requirements: *id001
|
|
19
|
+
dependencies: []
|
|
20
|
+
|
|
32
21
|
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
|
33
22
|
email: norman@njclarke.com
|
|
34
23
|
executables: []
|