utf8_utils 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +16 -31
- data/lib/utf8_utils.rb +77 -14
- data/lib/utf8_utils/version.rb +1 -1
- data/test/utf8_utils_test.rb +10 -61
- metadata +3 -6
- data/lib/utf8_utils/byte.rb +0 -86
- data/lib/utf8_utils/char.rb +0 -52
- data/lib/utf8_utils/chars.rb +0 -59
data/README.md
CHANGED
@@ -2,10 +2,12 @@
|
|
2
2
|
|
3
3
|
This library provides a means of cleaning UTF8 strings with invalid characters.
|
4
4
|
|
5
|
-
It provides functionality
|
5
|
+
It provides functionality that replaces [ActiveSupport's `tidy_bytes`
|
6
6
|
method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
|
7
|
-
|
8
|
-
|
7
|
+
with a faster algorithm that works on 1.8.6 - 1.9.x.
|
8
|
+
|
9
|
+
I will be sending this as a patch to ActiveSupport; in the meantime you can
|
10
|
+
access it at [its home on GitHub](http://github.com/norman/utf8_utils).
|
9
11
|
|
10
12
|
## The Problem
|
11
13
|
|
@@ -19,35 +21,22 @@ Here's what happens when you try to access a string with invalid UTF-8 character
|
|
19
21
|
|
20
22
|
## The Solution
|
21
23
|
|
22
|
-
ruby-1.9.1-p378 > "my messed up \x92 string".
|
24
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".tidy_bytes.split(//u)
|
23
25
|
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
26
|
|
25
|
-
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
26
|
-
some badly encoded data you need to clean up, it can save you from ripping out
|
27
|
-
your hair.
|
28
|
-
|
29
27
|
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
30
28
|
characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
|
31
29
|
a bad assumption, but may not always work.
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
strings, it should, however, be good enough for many kinds of applications.
|
36
|
-
|
37
|
-
How poor is "very poor?" Have a look:
|
38
|
-
|
31
|
+
This library's `tidy_bytes` method is a little less than twice as fast as the
|
32
|
+
one provided by ActiveSupport:
|
39
33
|
|
40
34
|
| ACTIVE_SUPPORT | UTF8_UTILS |
|
41
35
|
----------------------------------------------------------
|
42
|
-
tidy bytes
|
36
|
+
tidy bytes x20000 | 1.008 | 0.650 |
|
43
37
|
==========================================================
|
44
|
-
Total |
|
45
|
-
|
38
|
+
Total | 1.008 | 0.650 |
|
46
39
|
|
47
|
-
This will improve quite a bit soon, as I'm pretty well aware of where the
|
48
|
-
slowness is coming from. If performance is important for you now though, by all
|
49
|
-
means use another library (if you can find one) until I've made a few more
|
50
|
-
releases.
|
51
40
|
|
52
41
|
## Getting it
|
53
42
|
|
@@ -59,13 +48,11 @@ releases.
|
|
59
48
|
# encoding: utf-8
|
60
49
|
require "utf8_utils"
|
61
50
|
|
62
|
-
#
|
63
|
-
|
64
|
-
puts char.valid?
|
65
|
-
end
|
51
|
+
# tidy bytes
|
52
|
+
good_string = bad_string.tidy_bytes
|
66
53
|
|
67
|
-
|
68
|
-
|
54
|
+
# tidy bytes in-place
|
55
|
+
string.tidy_bytes!
|
69
56
|
|
70
57
|
## API Docs
|
71
58
|
|
@@ -73,8 +60,6 @@ releases.
|
|
73
60
|
|
74
61
|
## Credits
|
75
62
|
|
76
|
-
Created by Norman Clarke.
|
77
|
-
[ActiveRecord](http://github.com/rails/rails/tree/master/activesupport/), as
|
78
|
-
indicated in the source code.
|
63
|
+
Created by Norman Clarke.
|
79
64
|
|
80
|
-
Copyright (c) 2010, released under the MIT license.
|
65
|
+
Copyright (c) 2010, released under the MIT license.
|
data/lib/utf8_utils.rb
CHANGED
@@ -1,14 +1,8 @@
|
|
1
|
-
|
2
|
-
require File.expand_path("../utf8_utils/char", __FILE__)
|
3
|
-
require File.expand_path("../utf8_utils/chars", __FILE__)
|
4
|
-
|
5
|
-
# Wraps a string as an array of bytes and allows some naive cleanup operations
|
6
|
-
# as a workaround for Ruby 1.9's crappy encoding support that throws exceptions
|
7
|
-
# when attempting to access UTF8 strings with invalid characters.
|
1
|
+
# Utilities for cleaning up UTF-8 strings with invalid characters.
|
8
2
|
module UTF8Utils
|
9
3
|
|
10
|
-
#
|
11
|
-
|
4
|
+
# CP1252 decimal byte => UTF-8 approximation as an array of bytes
|
5
|
+
CP1252 = {
|
12
6
|
128 => [226, 130, 172],
|
13
7
|
129 => nil,
|
14
8
|
130 => [226, 128, 154],
|
@@ -43,11 +37,80 @@ module UTF8Utils
|
|
43
37
|
159 => [197, 184]
|
44
38
|
}
|
45
39
|
|
46
|
-
|
40
|
+
# A mixin to Ruby's String class to add the {#tidy_bytes} and {#tidy_bytes!}
|
41
|
+
# methods.
|
42
|
+
module StringExt
|
47
43
|
|
48
|
-
#
|
49
|
-
|
50
|
-
|
51
|
-
|
44
|
+
# Attempt to replace invalid UTF-8 bytes with valid ones. This method
|
45
|
+
# naively assumes if you have invalid UTF8 bytes, they are either Windows
|
46
|
+
# CP1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
|
47
|
+
# always work.
|
48
|
+
def tidy_bytes
|
49
|
+
|
50
|
+
bytes = unpack("C*")
|
51
|
+
continuation_bytes_expected = 0
|
52
|
+
|
53
|
+
bytes.each_index do |index|
|
54
|
+
|
55
|
+
byte = bytes[index]
|
56
|
+
|
57
|
+
is_continuation_byte = byte[7] == 1 && byte[6] == 0
|
58
|
+
ascii_byte = byte[7] == 0
|
59
|
+
leading_byte = byte[7] == 1 && byte[6] == 1
|
60
|
+
|
61
|
+
if is_continuation_byte
|
62
|
+
if continuation_bytes_expected > 0
|
63
|
+
continuation_bytes_expected = continuation_bytes_expected - 1
|
64
|
+
else
|
65
|
+
# Not expecting a continuation, so clean it
|
66
|
+
bytes[index] = tidy_byte(byte)
|
67
|
+
end
|
68
|
+
# ASCII byte
|
69
|
+
elsif ascii_byte
|
70
|
+
if continuation_bytes_expected > 0
|
71
|
+
# Expected continuation, got ASCII, so clean previous
|
72
|
+
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
73
|
+
continuation_bytes_expected = 0
|
74
|
+
end
|
75
|
+
elsif leading_byte
|
76
|
+
if continuation_bytes_expected > 0
|
77
|
+
# Expected continuation, got leading, so clean previous
|
78
|
+
bytes[index - 1] = tidy_byte(bytes[index - 1])
|
79
|
+
continuation_bytes_expected = 0
|
80
|
+
end
|
81
|
+
continuation_bytes_expected =
|
82
|
+
if byte[5] == 0 then 1
|
83
|
+
elsif byte[4] == 0 then 2
|
84
|
+
elsif byte[3] == 0 then 3
|
85
|
+
end
|
86
|
+
end
|
87
|
+
# Don't allow the string to terminate with a leading byte
|
88
|
+
if leading_byte && index == bytes.length - 1
|
89
|
+
bytes[index] = tidy_byte(bytes.last)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
|
93
|
+
end
|
94
|
+
|
95
|
+
# Tidy bytes in-place.
|
96
|
+
def tidy_bytes!
|
97
|
+
replace tidy_bytes
|
98
|
+
end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
def tidy_byte(byte)
|
103
|
+
if UTF8Utils::CP1252.key? byte
|
104
|
+
UTF8Utils::CP1252[byte]
|
105
|
+
elsif byte < 192
|
106
|
+
[194, byte]
|
107
|
+
else
|
108
|
+
[195, byte - 64]
|
109
|
+
end
|
110
|
+
end
|
52
111
|
end
|
53
112
|
end
|
113
|
+
|
114
|
+
class String
|
115
|
+
include UTF8Utils::StringExt
|
116
|
+
end
|
data/lib/utf8_utils/version.rb
CHANGED
data/test/utf8_utils_test.rb
CHANGED
@@ -1,70 +1,19 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
require "rubygems"
|
4
2
|
require "test/unit"
|
5
|
-
require "mocha"
|
6
3
|
require File.expand_path("../../lib/utf8_utils", __FILE__)
|
7
4
|
|
8
|
-
module UTF8ByteTest
|
9
|
-
|
10
|
-
def test_leading_1_bits
|
11
|
-
[0, 128, 194, 224, 240].each_with_index do |n, i|
|
12
|
-
byte = UTF8Utils::Byte.new(n)
|
13
|
-
assert_equal i, byte.leading_1_bits
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
def test_invalid_bytes
|
18
|
-
[192, 193, 245, 255].each do |n|
|
19
|
-
assert !UTF8Utils::Byte.new(n).valid?
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
def test_continuation
|
24
|
-
assert UTF8Utils::Byte.new(130).continuation?
|
25
|
-
end
|
26
|
-
|
27
|
-
end
|
28
|
-
|
29
5
|
class UTF8UtilsTest < Test::Unit::TestCase
|
30
6
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
def test_entries_should_be_two_bytes_for_latin_char_with_diacritics
|
38
|
-
assert_equal 2, "¡".to_utf8_chars.first.length
|
39
|
-
end
|
40
|
-
|
41
|
-
def test_entries_should_be_three_bytes_for_basic_multilingual_char
|
42
|
-
assert_equal 3, "आ".to_utf8_chars.first.length
|
43
|
-
end
|
7
|
+
CASES = {
|
8
|
+
"Sim\xF3n Bol\xEDvar" => "Simón Bolívar", # utf-8 leading bytes followed by an ascii char (fix as CP1252)
|
9
|
+
"\xBFhola?" => "¿hola?", # iso-8859-1 inverted question mark
|
10
|
+
"\xFF" => "something"
|
11
|
+
}
|
44
12
|
|
45
|
-
def
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
u.stubs(:bytes).returns([240, 144, 144, 132].map { |b| UTF8Utils::Byte.new(b)})
|
50
|
-
assert_equal 4, u.first.length
|
51
|
-
end
|
52
|
-
|
53
|
-
def test_should_detect_valid_chars
|
54
|
-
"cañón आ".to_utf8_chars.each_char {|c| assert c.valid? }
|
55
|
-
end
|
56
|
-
|
57
|
-
def test_should_detect_invalid_chars
|
58
|
-
"\x92".to_utf8_chars.each_char {|c| assert c.invalid? }
|
59
|
-
end
|
60
|
-
|
61
|
-
def test_should_split_correctly_with_invalid_chars
|
62
|
-
assert_equal 3, "a\x92a".to_utf8_chars.entries.length
|
63
|
-
end
|
64
|
-
|
65
|
-
def test_should_tidy_bytes
|
66
|
-
assert_equal "a’a", "a\x92a".to_utf8_chars.tidy_bytes.to_s
|
67
|
-
assert_equal "Simón Bolívar", "Sim\xF3n Bol\xEDvar".to_utf8_chars.tidy_bytes.to_s
|
13
|
+
def test_tidy_bytes
|
14
|
+
CASES.each do |bad, good|
|
15
|
+
assert_equal good, bad.tidy_bytes
|
16
|
+
end
|
68
17
|
end
|
69
18
|
|
70
|
-
end
|
19
|
+
end
|
metadata
CHANGED
@@ -3,10 +3,10 @@ name: utf8_utils
|
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
|
-
-
|
6
|
+
- 2
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
version:
|
9
|
+
version: 2.0.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Norman Clarke
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-04-
|
17
|
+
date: 2010-04-08 00:00:00 -03:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -38,9 +38,6 @@ extensions: []
|
|
38
38
|
extra_rdoc_files: []
|
39
39
|
|
40
40
|
files:
|
41
|
-
- lib/utf8_utils/byte.rb
|
42
|
-
- lib/utf8_utils/char.rb
|
43
|
-
- lib/utf8_utils/chars.rb
|
44
41
|
- lib/utf8_utils/version.rb
|
45
42
|
- lib/utf8_utils.rb
|
46
43
|
- README.md
|
data/lib/utf8_utils/byte.rb
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
|
3
|
-
# A single UTF-8 byte.
|
4
|
-
class Byte
|
5
|
-
|
6
|
-
attr_reader :byte
|
7
|
-
|
8
|
-
def initialize(byte)
|
9
|
-
@byte = byte
|
10
|
-
end
|
11
|
-
|
12
|
-
def codepoint_mask
|
13
|
-
case leading_1_bits
|
14
|
-
when 0 then 0
|
15
|
-
when 1 then 0b1000_0000
|
16
|
-
when 2 then 0b1100_0000
|
17
|
-
when 3 then 0b1110_0000
|
18
|
-
when 4 then 0b1111_0000
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# Is this a continuation byte?
|
23
|
-
def continuation?
|
24
|
-
leading_1_bits == 1
|
25
|
-
end
|
26
|
-
|
27
|
-
# How many continuation bytes should follow this byte?
|
28
|
-
def continuations
|
29
|
-
bits = leading_1_bits
|
30
|
-
bits < 2 ? 0 : bits - 1
|
31
|
-
end
|
32
|
-
|
33
|
-
def invalid?
|
34
|
-
!valid?
|
35
|
-
end
|
36
|
-
|
37
|
-
# From Wikipedia's entry on UTF-8:
|
38
|
-
#
|
39
|
-
# The UTF-8 encoding is variable-width, with each character represented by 1
|
40
|
-
# to 4 bytes. Each byte has 0–4 leading consecutive 1 bits followed by a zero bit
|
41
|
-
# to indicate its type. N 1 bits indicates the first byte in a N-byte sequence,
|
42
|
-
# with the exception that zero 1 bits indicates a one-byte sequence while one 1
|
43
|
-
# bit indicates a continuation byte in a multi-byte sequence (this was done for
|
44
|
-
# ASCII compatibility).
|
45
|
-
# @see http://en.wikipedia.org/wiki/Utf-8
|
46
|
-
def leading_1_bits
|
47
|
-
nibble = byte >> 4
|
48
|
-
if nibble < 0b1000 then 0 # single-byte chars
|
49
|
-
elsif nibble < 0b1100 then 1 # continuation byte
|
50
|
-
elsif nibble < 0b1110 then 2 # start of 2-byte char
|
51
|
-
elsif nibble < 0b1111 then 3 # 3-byte char
|
52
|
-
else 4 # 4-byte char
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
56
|
-
# Start of a 2-byte sequence, but code point ≤ 127
|
57
|
-
# @see http://tools.ietf.org/html/rfc3629
|
58
|
-
def overlong?
|
59
|
-
(192..193) === byte
|
60
|
-
end
|
61
|
-
|
62
|
-
# RFC 3629 reserves 245-253 for the leading bytes of 4-6 byte sequences.
|
63
|
-
# @see http://tools.ietf.org/html/rfc3629
|
64
|
-
def restricted?
|
65
|
-
(245..253) === byte
|
66
|
-
end
|
67
|
-
|
68
|
-
def to_i
|
69
|
-
byte
|
70
|
-
end
|
71
|
-
|
72
|
-
# Bytes 254 and 255 are not defined by the original UTF-8 spec.
|
73
|
-
def undefined?
|
74
|
-
(254..255) === byte
|
75
|
-
end
|
76
|
-
|
77
|
-
def valid?
|
78
|
-
!(overlong? or restricted? or undefined?)
|
79
|
-
end
|
80
|
-
|
81
|
-
def codepoint_bits
|
82
|
-
byte ^ codepoint_mask
|
83
|
-
end
|
84
|
-
|
85
|
-
end
|
86
|
-
end
|
data/lib/utf8_utils/char.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
|
3
|
-
class Char < Array
|
4
|
-
|
5
|
-
# Given the first byte, how many bytes long should this character be?
|
6
|
-
def expected_length
|
7
|
-
(first.continuations rescue 0) + 1
|
8
|
-
end
|
9
|
-
|
10
|
-
# Is the character invalid?
|
11
|
-
def invalid?
|
12
|
-
!valid?
|
13
|
-
end
|
14
|
-
|
15
|
-
# Attempt to rescue a valid UTF-8 character from a malformed character. It
|
16
|
-
# will first attempt to convert from CP1251, and if this isn't possible, it
|
17
|
-
# prepends a valid leading byte, treating the character as the last byte in
|
18
|
-
# a two-byte character. Note that much of the logic here is taken from
|
19
|
-
# ActiveSupport; the difference is that this works for Ruby 1.8.6 - 1.9.1.
|
20
|
-
def tidy
|
21
|
-
return self if valid?
|
22
|
-
byte = first.to_i
|
23
|
-
if UTF8Utils::CP1251.key? byte
|
24
|
-
self.class.new [UTF8Utils::CP1251[byte]]
|
25
|
-
elsif byte < 192
|
26
|
-
self.class.new [194, byte]
|
27
|
-
else
|
28
|
-
self.class.new [195, byte - 64]
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# Get a multibyte character from the bytes.
|
33
|
-
def to_s
|
34
|
-
flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
35
|
-
end
|
36
|
-
|
37
|
-
def to_codepoint
|
38
|
-
flatten.map {|b| b.to_i }.pack("C*").unpack("U*")[0]
|
39
|
-
end
|
40
|
-
|
41
|
-
def valid?
|
42
|
-
return false if length != expected_length
|
43
|
-
each_with_index do |byte, index|
|
44
|
-
return false if byte.invalid?
|
45
|
-
return false if index == 0 and byte.continuation?
|
46
|
-
return false if index > 0 and !byte.continuation?
|
47
|
-
end
|
48
|
-
true
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
end
|
data/lib/utf8_utils/chars.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
module UTF8Utils
|
2
|
-
class Chars
|
3
|
-
|
4
|
-
attr :bytes
|
5
|
-
attr :position
|
6
|
-
|
7
|
-
include Enumerable
|
8
|
-
|
9
|
-
def initialize(string)
|
10
|
-
@position = 0
|
11
|
-
begin
|
12
|
-
# Create an array of bytes without raising an ArgumentError in 1.9.x
|
13
|
-
# when the string contains invalid UTF-8 characters
|
14
|
-
@bytes = string.each_byte.map {|b| Byte.new(b)}
|
15
|
-
rescue LocalJumpError
|
16
|
-
# 1.8.6's `each_byte` does not return an Enumerable
|
17
|
-
@bytes = []
|
18
|
-
string.each_byte { |b| @bytes << Byte.new(b) }
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# Attempt to clean up malformed characters.
|
23
|
-
def tidy_bytes
|
24
|
-
Chars.new(entries.map {|c| c.tidy.to_s}.compact.join)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Cast to string.
|
28
|
-
def to_s
|
29
|
-
entries.flatten.map {|b| b.to_i }.pack("C*").unpack("U*").pack("U*")
|
30
|
-
end
|
31
|
-
|
32
|
-
def first
|
33
|
-
entries.first
|
34
|
-
end
|
35
|
-
|
36
|
-
private
|
37
|
-
|
38
|
-
def each(&block)
|
39
|
-
while char = next_char
|
40
|
-
yield char
|
41
|
-
end
|
42
|
-
@position = 0
|
43
|
-
end
|
44
|
-
|
45
|
-
alias :each_char :each
|
46
|
-
public :each_char
|
47
|
-
|
48
|
-
def next_char
|
49
|
-
return if !bytes[position]
|
50
|
-
char = Char.new(bytes.slice(position, bytes[position].continuations + 1))
|
51
|
-
if char.invalid?
|
52
|
-
char = Char.new(bytes.slice(position, 1))
|
53
|
-
end
|
54
|
-
@position = position + char.size
|
55
|
-
char unless char.empty?
|
56
|
-
end
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|