utf8_utils 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +16 -31
- data/lib/utf8_utils.rb +77 -14
- data/lib/utf8_utils/version.rb +1 -1
- data/test/utf8_utils_test.rb +10 -61
- metadata +3 -6
- data/lib/utf8_utils/byte.rb +0 -86
- data/lib/utf8_utils/char.rb +0 -52
- data/lib/utf8_utils/chars.rb +0 -59
data/README.md
CHANGED
@@ -2,10 +2,12 @@
|
|
2
2
|
|
3
3
|
This library provides a means of cleaning UTF8 strings with invalid characters.
|
4
4
|
|
5
|
-
It provides functionality
|
5
|
+
It provides functionality that replaces [ActiveSupport's `tidy_bytes`
|
6
6
|
method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
|
7
|
-
|
8
|
-
|
7
|
+
with a faster algorithm that works on 1.8.6 - 1.9.x.
|
8
|
+
|
9
|
+
I will be sending this as a patch to ActiveSupport; in the meantime you can
|
10
|
+
access it at [its home on GitHub](http://github.com/norman/utf8_utils).
|
9
11
|
|
10
12
|
## The Problem
|
11
13
|
|
@@ -19,35 +21,22 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
19
21
|
|
20
22
|
## The Solution
|
21
23
|
|
22
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".
|
24
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".tidy_bytes.split(//u)
|
23
25
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
26
|
|
25
|
-
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
26
|
-
some badly encoded data you need to clean up, it can save you from ripping out
|
27
|
-
your hair.
|
28
|
-
|
29
27
|
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
30
28
|
characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
|
31
29
|
a bad assumption, but may not always work.
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
strings, it should, however, be good enough for many kinds of applications.
|
36
|
-
|
37
|
-
How poor is "very poor?" Have a look:
|
38
|
-
|
31
|
+
This library's `tidy_bytes` method is a little less than twice as fast as the
|
32
|
+
one provided by ActiveSupport:
|
39
33
|
|
40
34
|
| ACTIVE_SUPPORT | UTF8_UTILS |
|
41
35
|
----------------------------------------------------------
|
42
|
-
tidy bytes
|
36
|
+
tidy bytes x20000 | 1.008 | 0.650 |
|
43
37
|
==========================================================
|
44
|
-
Total |
|
45
|
-
|
38
|
+
Total | 1.008 | 0.650 |
|
46
39
|
|
47
|
-
This will improve quite a bit soon, as I'm pretty well aware of where the
|
48
|
-
slowness is coming from. If performance is important for you now though, by all
|
49
|
-
means use another library (if you can find one) until I've made a few more
|
50
|
-
releases.
|
51
40
|
|
52
41
|
## Getting it
|
53
42
|
|
@@ -59,13 +48,11 @@ releases.
|
|
59
48
|
# encoding: utf-8
|
60
49
|
require "utf8_utils"
|
61
50
|
|
62
|
-
#
|
63
|
-
|
64
|
-
puts char.valid?
|
65
|
-
end
|
51
|
+
# tidy bytes
|
52
|
+
good_string = bad_string.tidy_bytes
|
66
53
|
|
67
|
-
|
68
|
-
|
54
|
+
# tidy bytes in-place
|
55
|
+
string.tidy_bytes!
|
69
56
|
|
70
57
|
## API Docs
|
71
58
|
|
@@ -73,8 +60,6 @@ releases.
|
|
73
60
|
|
74
61
|
## Credits
|
75
62
|
|
76
|
-
Created by Norman Clarke.
|
77
|
-
[ActiveRecord](http://github.com/rails/rails/tree/master/activesupport/), as
|
78
|
-
indicated in the source code.
|
63
|
+
Created by Norman Clarke.
|
79
64
|
|
80
|
-
Copyright (c) 2010, released under the MIT license.
|
65
|
+
Copyright (c) 2010, released under the MIT license.
|
data/lib/utf8_utils.rb
CHANGED
@@ -1,14 +1,8 @@
|
|
1
|
-
|
2
|
-
require File.expand_path("../utf8_utils/char", __FILE__)
|
3
|
-
require File.expand_path("../utf8_utils/chars", __FILE__)
|
4
|
-
|
5
|
-
# Wraps a string as an array of bytes and allows some naive cleanup operations
|
6
|
-
# as a workaround for Ruby 1.9's crappy encoding support that throws exceptions
|
7
|
-
# when attempting to access UTF8 strings with invalid characters.
|
1
|
+
# Utilities for cleaning up UTF-8 strings with invalid characters.
|
8
2
|
module UTF8Utils
|
9
3
|
|
10
|
-
#
|
11
|
-
|
4
|
+
# CP1252 decimal byte => UTF-8 approximation as an array of bytes
|
5
|
+
CP1252 = {
|
12
6
|
128 => [226, 130, 172],
|
13
7
|
129 => nil,
|
14
8
|
130 => [226, 128, 154],
|
@@ -43,11 +37,80 @@ module UTF8Utils
|
|
43
37
|
159 => [197, 184]
|
44
38
|
}
|
45
39
|
|
46
|
-
|
40
|
+
# A mixin to Ruby's String class to add the {#tidy_bytes} and {#tidy_bytes!}
|
41
|
+
# methods.
|
42
|
+
module StringExt
|
47
43
|
|
48
|
-
#
|
49
|
-
|
50
|
-
|
51
|
-
|
44
|
+
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
45
|
+
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
46
|
+
# CP1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
47
|
+
# always work.
|
48
|
+
def tidy_bytes
|
49
|
+
|
50
|
+
bytes = unpack("C*")
|
51
|
+
continuation_bytes_expected = 0
|
52
|
+
|
53
|
+
bytes.each_index do |index|
|
54
|
+
|
55
|
+
byte = bytes[index]
|
56
|
+
|
57
|
+
is_continuation_byte = byte[7] == 1 && byte[6] == 0
|
58
|
+
ascii_byte = byte[7] == 0
|
59
|
+
leading_byte = byte[7] == 1 && byte[6] == 1
|
60
|
+
|
61
|
+
if is_continuation_byte
|
62
|
+
if continuation_bytes_expected > 0
|
63
|
+
continuation_bytes_expected = continuation_bytes_expected - 1
|
64
|
+
else
|
65
|
+
# Not expecting a continuation, so clean it
|
66
|
+
bytes[index] = tidy_byte(byte)
|
67
|
+
end
|
68
|
+
# ASCII byte
|
69
|
+
elsif ascii_byte
|
70
|
+
if continuation_bytes_expected > 0
|
71
|
+
# Expected continuation, got ASCII, so clean previous
|
72
|
+
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
73
|
+
continuation_bytes_expected = 0
|
74
|
+
end
|
75
|
+
elsif leading_byte
|
76
|
+
if continuation_bytes_expected > 0
|
77
|
+
# Expected continuation, got leading, so clean previous
|
78
|
+
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
79
|
+
continuation_bytes_expected = 0
|
80
|
+
end
|
81
|
+
continuation_bytes_expected =
|
82
|
+
if byte[5] == 0 then 1
|
83
|
+
elsif byte[4] == 0 then 2
|
84
|
+
elsif byte[3] == 0 then 3
|
85
|
+
end
|
86
|
+
end
|
87
|
+
# Don't allow the string to terminate with a leading byte
|
88
|
+
if leading_byte && index == bytes.length - 1
|
89
|
+
bytes[index] = tidy_byte(bytes.last)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
93
|
+
end
|
94
|
+
|
95
|
+
# Tidy bytes in-place.
|
96
|
+
def tidy_bytes!
|
97
|
+
replace tidy_bytes
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
def tidy_byte(byte)
|
103
|
+
if UTF8Utils::CP1252.key? byte
|
104
|
+
UTF8Utils::CP1252[byte]
|
105
|
+
elsif byte < 192
|
106
|
+
[194, byte]
|
107
|
+
else
|
108
|
+
[195, byte - 64]
|
109
|
+
end
|
110
|
+
end
|
52
111
|
end
|
53
112
|
end
|
113
|
+
|
114
|
+
class String
|
115
|
+
include UTF8Utils::StringExt
|
116
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
@@ -1,70 +1,19 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
require "rubygems"
|
4
2
|
require "test/unit"
|
5
|
-
require "mocha"
|
6
3
|
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
7
4
|
|
8
|
-
module UTF8ByteTest
|
9
|
-
|
10
|
-
def test_leading_1_bits
|
11
|
-
[0, 128, 194, 224, 240].each_with_index do |n, i|
|
12
|
-
byte = UTF8Utils::Byte.new(n)
|
13
|
-
assert_equal i, byte.leading_1_bits
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def test_invalid_bytes
|
18
|
-
[192, 193, 245, 255].each do |n|
|
19
|
-
assert !UTF8Utils::Byte.new(n).valid?
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_continuation
|
24
|
-
assert UTF8Utils::Byte.new(130).continuation?
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
5
|
class UTF8UtilsTest < Test::Unit::TestCase
|
30
6
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
def test_entries_should_be_two_bytes_for_latin_char_with_diacritics
|
38
|
-
assert_equal 2, "¡".to_utf8_chars.first.length
|
39
|
-
end
|
40
|
-
|
41
|
-
def test_entries_should_be_three_bytes_for_basic_multilingual_char
|
42
|
-
assert_equal 3, "आ".to_utf8_chars.first.length
|
43
|
-
end
|
7
|
+
CASES = {
|
8
|
+
"Sim\xF3n Bol\xEDvar" => "Simón Bolívar", # utf-8 leading bytes followed by an ascii char (fix as CP1252)
|
9
|
+
"\xBFhola?" => "¿hola?", # iso-8859-1 inverted question mark
|
10
|
+
"\xFF" => "something"
|
11
|
+
}
|
44
12
|
|
45
|
-
def
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
u.stubs(:bytes).returns([240, 144, 144, 132].map { |b| UTF8Utils::Byte.new(b)})
|
50
|
-
assert_equal 4, u.first.length
|
51
|
-
end
|
52
|
-
|
53
|
-
def test_should_detect_valid_chars
|
54
|
-
"cañón आ".to_utf8_chars.each_char {|c| assert c.valid? }
|
55
|
-
end
|
56
|
-
|
57
|
-
def test_should_detect_invalid_chars
|
58
|
-
"\x92".to_utf8_chars.each_char {|c| assert c.invalid? }
|
59
|
-
end
|
60
|
-
|
61
|
-
def test_should_split_correctly_with_invalid_chars
|
62
|
-
assert_equal 3, "a\x92a".to_utf8_chars.entries.length
|
63
|
-
end
|
64
|
-
|
65
|
-
def test_should_tidy_bytes
|
66
|
-
assert_equal "a’a", "a\x92a".to_utf8_chars.tidy_bytes.to_s
|
67
|
-
assert_equal "Simón Bolívar", "Sim\xF3n Bol\xEDvar".to_utf8_chars.tidy_bytes.to_s
|
13
|
+
def test_tidy_bytes
|
14
|
+
CASES.each do |bad, good|
|
15
|
+
assert_equal good, bad.tidy_bytes
|
16
|
+
end
|
68
17
|
end
|
69
18
|
|
70
|
-
end
|
19
|
+
end
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
-
-
|
6
|
+
- 2
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
version:
|
9
|
+
version: 2.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Norman Clarke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-04-
|
17
|
+
date: 2010-04-08 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -38,9 +38,6 @@ extensions: []
|
|
38
38
|
extra_rdoc_files: []
|
39
39
|
|
40
40
|
files:
|
41
|
-
- lib/utf8_utils/byte.rb
|
42
|
-
- lib/utf8_utils/char.rb
|
43
|
-
- lib/utf8_utils/chars.rb
|
44
41
|
- lib/utf8_utils/version.rb
|
45
42
|
- lib/utf8_utils.rb
|
46
43
|
- README.md
|
data/lib/utf8_utils/byte.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
|
3
|
-
# A single UTF-8 byte.
|
4
|
-
class Byte
|
5
|
-
|
6
|
-
attr_reader :byte
|
7
|
-
|
8
|
-
def initialize(byte)
|
9
|
-
@byte = byte
|
10
|
-
end
|
11
|
-
|
12
|
-
def codepoint_mask
|
13
|
-
case leading_1_bits
|
14
|
-
when 0 then 0
|
15
|
-
when 1 then 0b1000_0000
|
16
|
-
when 2 then 0b1100_0000
|
17
|
-
when 3 then 0b1110_0000
|
18
|
-
when 4 then 0b1111_0000
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# Is this a continuation byte?
|
23
|
-
def continuation?
|
24
|
-
leading_1_bits == 1
|
25
|
-
end
|
26
|
-
|
27
|
-
# How many continuation bytes should follow this byte?
|
28
|
-
def continuations
|
29
|
-
bits = leading_1_bits
|
30
|
-
bits < 2 ? 0 : bits - 1
|
31
|
-
end
|
32
|
-
|
33
|
-
def invalid?
|
34
|
-
!valid?
|
35
|
-
end
|
36
|
-
|
37
|
-
# From Wikipedia's entry on UTF-8:
|
38
|
-
#
|
39
|
-
# The UTF-8 encoding is variable-width, with each character represented by 1
|
40
|
-
# to 4 bytes. Each byte has 0–4 leading consecutive 1 bits followed by a zero bit
|
41
|
-
# to indicate its type. N 1 bits indicates the first byte in a N-byte sequence,
|
42
|
-
# with the exception that zero 1 bits indicates a one-byte sequence while one 1
|
43
|
-
# bit indicates a continuation byte in a multi-byte sequence (this was done for
|
44
|
-
# ASCII compatibility).
|
45
|
-
# @see http://en.wikipedia.org/wiki/Utf-8
|
46
|
-
def leading_1_bits
|
47
|
-
nibble = byte >> 4
|
48
|
-
if nibble < 0b1000 then 0 # single-byte chars
|
49
|
-
elsif nibble < 0b1100 then 1 # continuation byte
|
50
|
-
elsif nibble < 0b1110 then 2 # start of 2-byte char
|
51
|
-
elsif nibble < 0b1111 then 3 # 3-byte char
|
52
|
-
else 4 # 4-byte char
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# Start of a 2-byte sequence, but code point ≤ 127
|
57
|
-
# @see http://tools.ietf.org/html/rfc3629
|
58
|
-
def overlong?
|
59
|
-
(192..193) === byte
|
60
|
-
end
|
61
|
-
|
62
|
-
# RFC 3629 reserves 245-253 for the leading bytes of 4-6 byte sequences.
|
63
|
-
# @see http://tools.ietf.org/html/rfc3629
|
64
|
-
def restricted?
|
65
|
-
(245..253) === byte
|
66
|
-
end
|
67
|
-
|
68
|
-
def to_i
|
69
|
-
byte
|
70
|
-
end
|
71
|
-
|
72
|
-
# Bytes 254 and 255 are not defined by the original UTF-8 spec.
|
73
|
-
def undefined?
|
74
|
-
(254..255) === byte
|
75
|
-
end
|
76
|
-
|
77
|
-
def valid?
|
78
|
-
!(overlong? or restricted? or undefined?)
|
79
|
-
end
|
80
|
-
|
81
|
-
def codepoint_bits
|
82
|
-
byte ^ codepoint_mask
|
83
|
-
end
|
84
|
-
|
85
|
-
end
|
86
|
-
end
|
data/lib/utf8_utils/char.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
|
3
|
-
class Char < Array
|
4
|
-
|
5
|
-
# Given the first byte, how many bytes long should this character be?
|
6
|
-
def expected_length
|
7
|
-
(first.continuations rescue 0) + 1
|
8
|
-
end
|
9
|
-
|
10
|
-
# Is the character invalid?
|
11
|
-
def invalid?
|
12
|
-
!valid?
|
13
|
-
end
|
14
|
-
|
15
|
-
# Attempt to rescue a valid UTF-8 character from a malformed character. It
|
16
|
-
# will first attempt to convert from CP1251, and if this isn't possible, it
|
17
|
-
# prepends a valid leading byte, treating the character as the last byte in
|
18
|
-
# a two-byte character. Note that much of the logic here is taken from
|
19
|
-
# ActiveSupport; the difference is that this works for Ruby 1.8.6 - 1.9.1.
|
20
|
-
def tidy
|
21
|
-
return self if valid?
|
22
|
-
byte = first.to_i
|
23
|
-
if UTF8Utils::CP1251.key? byte
|
24
|
-
self.class.new [UTF8Utils::CP1251[byte]]
|
25
|
-
elsif byte < 192
|
26
|
-
self.class.new [194, byte]
|
27
|
-
else
|
28
|
-
self.class.new [195, byte - 64]
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# Get a multibyte character from the bytes.
|
33
|
-
def to_s
|
34
|
-
flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
35
|
-
end
|
36
|
-
|
37
|
-
def to_codepoint
|
38
|
-
flatten.map {|b| b.to_i }.pack("C*").unpack("U*")[0]
|
39
|
-
end
|
40
|
-
|
41
|
-
def valid?
|
42
|
-
return false if length != expected_length
|
43
|
-
each_with_index do |byte, index|
|
44
|
-
return false if byte.invalid?
|
45
|
-
return false if index == 0 and byte.continuation?
|
46
|
-
return false if index > 0 and !byte.continuation?
|
47
|
-
end
|
48
|
-
true
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|
data/lib/utf8_utils/chars.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
class Chars
|
3
|
-
|
4
|
-
attr :bytes
|
5
|
-
attr :position
|
6
|
-
|
7
|
-
include Enumerable
|
8
|
-
|
9
|
-
def initialize(string)
|
10
|
-
@position = 0
|
11
|
-
begin
|
12
|
-
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
13
|
-
# when the string contains invalid UTF-8 characters
|
14
|
-
@bytes = string.each_byte.map {|b| Byte.new(b)}
|
15
|
-
rescue LocalJumpError
|
16
|
-
# 1.8.6's `each_byte` does not return an Enumerable
|
17
|
-
@bytes = []
|
18
|
-
string.each_byte { |b| @bytes << Byte.new(b) }
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# Attempt to clean up malformed characters.
|
23
|
-
def tidy_bytes
|
24
|
-
Chars.new(entries.map {|c| c.tidy.to_s}.compact.join)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Cast to string.
|
28
|
-
def to_s
|
29
|
-
entries.flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
30
|
-
end
|
31
|
-
|
32
|
-
def first
|
33
|
-
entries.first
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def each(&block)
|
39
|
-
while char = next_char
|
40
|
-
yield char
|
41
|
-
end
|
42
|
-
@position = 0
|
43
|
-
end
|
44
|
-
|
45
|
-
alias :each_char :each
|
46
|
-
public :each_char
|
47
|
-
|
48
|
-
def next_char
|
49
|
-
return if !bytes[position]
|
50
|
-
char = Char.new(bytes.slice(position, bytes[position].continuations + 1))
|
51
|
-
if char.invalid?
|
52
|
-
char = Char.new(bytes.slice(position, 1))
|
53
|
-
end
|
54
|
-
@position = position + char.size
|
55
|
-
char unless char.empty?
|
56
|
-
end
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|