utf8_utils 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +19 -0
- data/README.md +58 -0
- data/Rakefile +25 -0
- data/lib/utf8_utils/version.rb +8 -0
- data/lib/utf8_utils.rb +156 -0
- data/test/utf8_utils_test.rb +49 -0
- metadata +67 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2010 Norman Clarke
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# UTF8 Utils
|
2
|
+
|
3
|
+
This library provides a means of cleaning UTF8 strings with invalid characters.
|
4
|
+
|
5
|
+
It provides functionality very similar to [ActiveSupport's `tidy_bytes`
|
6
|
+
method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
|
7
|
+
but works for Ruby 1.8.6 - 1.9.x. Once I sort out any potentially embarrassing
|
8
|
+
issues with it, I'll probably try patching it into ActiveSupport.
|
9
|
+
|
10
|
+
## The Problem
|
11
|
+
|
12
|
+
Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
|
13
|
+
|
14
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
|
15
|
+
ArgumentError: invalid byte sequence in UTF-8
|
16
|
+
from (irb):3:in `split'
|
17
|
+
from (irb):3
|
18
|
+
from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
|
19
|
+
|
20
|
+
## The Solution
|
21
|
+
|
22
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_codepoints.tidy_bytes.to_s.split(//u)
|
23
|
+
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
|
+
|
25
|
+
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
26
|
+
some badly encoded data you need to clean up, it can save you from ripping out
|
27
|
+
your hair.
|
28
|
+
|
29
|
+
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
30
|
+
characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
|
31
|
+
a bad assumption, but may not always work.
|
32
|
+
|
33
|
+
## Getting it
|
34
|
+
|
35
|
+
gem install utf8_utils
|
36
|
+
|
37
|
+
|
38
|
+
## Using it
|
39
|
+
|
40
|
+
require "utf8_utils"
|
41
|
+
|
42
|
+
# Traverse codepoints
|
43
|
+
"hello-world".to_utf8_codepoints.each_codepoint do |codepoint|
|
44
|
+
puts codepoint.valid?
|
45
|
+
end
|
46
|
+
|
47
|
+
# tidy bytes
|
48
|
+
good_string = bad_string.to_utf8_codepoints.tidy_bytes.to_s
|
49
|
+
|
50
|
+
## API Docs
|
51
|
+
|
52
|
+
[http://norman.github.com/utf8_utils](http://norman.github.com/utf8_utils)
|
53
|
+
|
54
|
+
## Credits
|
55
|
+
|
56
|
+
Created by Norman Clarke, with some code <strike>stolen</strike> borrowed from ActiveSupport.
|
57
|
+
|
58
|
+
Copyright (c) 2010, released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require "rake"
|
2
|
+
require "rake/testtask"
|
3
|
+
require "rake/gempackagetask"
|
4
|
+
require "rake/rdoctask"
|
5
|
+
require "rake/clean"
|
6
|
+
|
7
|
+
# Paths registered on rake/clean's CLEAN list (build, doc and coverage output).
%w[pkg doc coverage .yardoc].each { |artifact| CLEAN << artifact }

# Packaging tasks, configured from the gemspec that sits next to this Rakefile.
gemspec = eval(File.read("utf8_utils.gemspec"))
Rake::GemPackageTask.new(gemspec) { |pkg| }

# Test task: picks up every *_test.rb file below test/.
Rake::TestTask.new(:test) do |task|
  task.pattern = "test/**/*_test.rb"
end

# RDoc task: documents everything under lib/ and writes the output to doc/.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = "doc"
  rdoc.rdoc_files.include "lib/**/*.rb"
end

# The coverage task is optional: it is only defined when rcov can be loaded.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |rcov|
    rcov.test_files = FileList["test/**/*_test.rb"]
    rcov.verbose = true
    rcov.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
  # rcov is not installed; carry on without a coverage task.
end
|
data/lib/utf8_utils.rb
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
# Wraps a string as an array of bytes and allows some naive cleanup operations
# as a workaround for Ruby 1.9's strict encoding support, which raises
# exceptions when attempting to access UTF-8 strings that contain invalid
# byte sequences.
module UTF8Utils

  # The raw bytes of a string, traversable one UTF-8 codepoint at a time.
  # Bytes that do not form a valid UTF-8 sequence are yielded as single-byte
  # (invalid) codepoints so that they can be inspected or repaired.
  class Codepoints

    # The underlying array of byte values (Integers 0..255).
    attr_accessor :chars
    # Index into +chars+ of the next byte to read during a traversal.
    attr :position

    include Enumerable

    # Windows-1252 bytes 128..159 mapped to the UTF-8 byte sequence of the
    # character they represent. A nil value marks a byte that Windows-1252
    # leaves undefined; such bytes are dropped when tidying.
    CP1252 = {
      128 => [226, 130, 172],
      129 => nil,
      130 => [226, 128, 154],
      131 => [198, 146],
      132 => [226, 128, 158],
      133 => [226, 128, 166],
      134 => [226, 128, 160],
      135 => [226, 128, 161],
      136 => [203, 134],
      137 => [226, 128, 176],
      138 => [197, 160],
      139 => [226, 128, 185],
      140 => [197, 146],
      141 => nil,
      142 => [197, 189],
      143 => nil,
      144 => nil,
      145 => [226, 128, 152],
      146 => [226, 128, 153],
      147 => [226, 128, 156],
      148 => [226, 128, 157],
      149 => [226, 128, 162],
      150 => [226, 128, 147],
      151 => [226, 128, 148],
      152 => [203, 156],
      153 => [226, 132, 162],
      154 => [197, 161],
      155 => [226, 128, 186],
      156 => [197, 147],
      157 => nil,
      158 => [197, 190],
      159 => [197, 184]
    }

    # The table above is Windows-1252, not the Cyrillic code page 1251. The
    # original (mistaken) constant name is kept so existing callers still work.
    CP1251 = CP1252

    # Capture the bytes of +string+ without interpreting them as characters.
    def initialize(string)
      @position = 0
      # 1.8.6's `each_byte` does not return an Enumerable
      if RUBY_VERSION < "1.8.7"
        @chars = []
        string.each_byte { |b| @chars << b }
      else
        # Create an array of bytes without raising an ArgumentError in 1.9.x
        # when the string contains invalid UTF-8 characters
        @chars = string.each_byte.entries
      end
    end

    # Attempt to clean up malformed characters. Returns a new Codepoints in
    # which every invalid byte has been replaced by the UTF-8 encoding of its
    # Windows-1252 / ISO-8859-1 interpretation; bytes that Windows-1252 leaves
    # undefined are removed.
    def tidy_bytes
      Codepoints.new(entries.map {|c| c.tidy.to_char}.compact.join)
    end

    # Cast to string. Every codepoint is decoded as UTF-8, so run +tidy_bytes+
    # first if the bytes may contain invalid sequences.
    def to_s
      entries.map {|e| e.to_char}.join
    end

    private

    # Yield each codepoint in turn, starting from the current position.
    def each(&block)
      while codepoint = next_codepoint
        yield codepoint
      end
      @position = 0
    ensure
      # Rewind even when the block exits early (break, first, find, raise);
      # otherwise the next traversal would start in the middle of the string.
      @position = 0
    end

    alias :each_codepoint :each
    public :each_codepoint

    # Number of bytes the lead byte at the current position announces. Bytes
    # that cannot start a sequence still request more than one byte; the
    # resulting candidate fails validation and is retried as a single byte.
    def bytes_to_pull
      case chars[position]
      when 0..127   then 1
      when 128..223 then 2
      when 224..239 then 3
      else 4
      end
    end

    # Read the codepoint at the current position and advance past it.
    # Returns nil once all bytes have been consumed.
    def next_codepoint
      codepoint = Codepoint.new(chars.slice(position, bytes_to_pull))
      if codepoint.invalid?
        # Not a well-formed sequence: hand back just the first byte.
        codepoint = Codepoint.new(chars.slice(position, 1))
      end
      @position = position + codepoint.size
      codepoint unless codepoint.empty?
    end

  end

  # The bytes of one (possibly malformed) UTF-8 encoded character.
  class Codepoint < Array

    # True when the bytes form exactly one well-formed UTF-8 sequence.
    #
    # Borrowed from the regexp in ActiveSupport, which in turn had been borrowed from
    # the Kconv library by Shinji KONO - (also as seen on the W3C site), tightened
    # to reject overlong two-byte forms (lead bytes 192 and 193) and the UTF-16
    # surrogate range (237 followed by 160..191), neither of which is valid UTF-8.
    # See also http://en.wikipedia.org/wiki/UTF-8
    def valid?
      case length
      when 1
        (0..127) === self[0]
      when 2
        (194..223) === self[0] && continuation?(self[1])
      when 3
        continuation?(self[2]) && (
          (self[0] == 224 && (160..191) === self[1]) ||
          ((225..236) === self[0] && continuation?(self[1])) ||
          (self[0] == 237 && (128..159) === self[1]) ||
          ((238..239) === self[0] && continuation?(self[1]))
        )
      when 4
        continuation?(self[2]) && continuation?(self[3]) && (
          (self[0] == 240 && (144..191) === self[1]) ||
          ((241..243) === self[0] && continuation?(self[1])) ||
          (self[0] == 244 && (128..143) === self[1])
        )
      else
        false
      end
    end

    # Attempt to rescue a valid UTF-8 character from a malformed codepoint. It will first
    # attempt to convert from Windows-1252, and if this isn't possible, it treats the byte
    # as ISO-8859-1 and prepends the matching UTF-8 lead byte to form a two-byte codepoint.
    # Only the first byte is considered, which matches the single-byte invalid
    # codepoints produced by Codepoints. Bytes that Windows-1252 leaves undefined
    # yield an empty codepoint.
    # Note that much of the logic here is taken from ActiveSupport; the difference is that this
    # works for Ruby 1.8.6 - 1.9.1.
    def tidy
      return self if valid? || empty?
      byte = self[0]
      if Codepoints::CP1252.key? byte
        # A nil table entry means "no such character": return no bytes at all.
        self.class.new(Codepoints::CP1252[byte] || [])
      elsif byte < 192
        self.class.new [194, byte]
      else
        self.class.new [195, byte - 64]
      end
    end

    # The opposite of +valid?+.
    def invalid?
      !valid?
    end

    # Get a character from the bytes.
    def to_char
      # compact guards against nil placeholders so packing never sees a non-Integer.
      flatten.compact.pack("C*").unpack("U*").pack("U*")
    end

    private

    # True when +byte+ is a UTF-8 continuation byte (binary 10xxxxxx).
    def continuation?(byte)
      (128..191) === byte
    end

  end
end
|
150
|
+
|
151
|
+
# Core extension that makes the UTF8Utils byte wrapper reachable from any String.
class String
  # Wrap this string's bytes in a UTF8Utils::Codepoints so that its UTF-8
  # codepoints can be traversed and cleaned up.
  def to_utf8_codepoints
    UTF8Utils::Codepoints.new(self)
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "test/unit"
|
4
|
+
require File.join(File.dirname(__FILE__), "..", "lib", "utf8_utils")
|
5
|
+
|
6
|
+
# Exercises codepoint splitting, validity detection and byte tidying.
class UTF8CodepointsTest < Test::Unit::TestCase

  # A plain ASCII character occupies a single byte.
  def test_should_pull_one_byte_for_ascii_char
    codepoint = "a".to_utf8_codepoints.entries[0]
    assert_equal 1, codepoint.length
  end

  # Latin-1 supplement characters are encoded in two bytes.
  def test_should_pull_two_bytes_for_latin_char_with_diacritics
    codepoint = "¡".to_utf8_codepoints.entries[0]
    assert_equal 2, codepoint.length
  end

  # Most of the Basic Multilingual Plane needs three bytes.
  def test_should_pull_three_bytes_for_basic_multilingual_char
    codepoint = "आ".to_utf8_codepoints.entries[0]
    assert_equal 3, codepoint.length
  end

  # Characters outside the BMP need four bytes.
  def test_should_pull_four_bytes_for_other_chars
    codepoints = UTF8Utils::Codepoints.new("")
    # Editors tend to freak out with chars in this plane, so just stub the
    # chars field instead. This char is U+10405, DESERET CAPITAL LETTER LONG OO.
    codepoints.chars = [240, 144, 144, 132]
    assert_equal 4, codepoints.entries[0].length
  end

  def test_should_detect_valid_codepoints
    "cañón आ".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.valid?
    end
  end

  def test_should_detect_invalid_codepoints
    "\x92".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.invalid?
    end
  end

  # An invalid byte counts as its own codepoint between its neighbours.
  def test_should_split_correctly_with_invalid_codepoints
    codepoints = "a\x92a".to_utf8_codepoints.entries
    assert_equal 3, codepoints.length
  end

  def test_should_tidy_bytes
    tidied = "a\x92a".to_utf8_codepoints.tidy_bytes.to_s
    assert_equal "a’a", tidied
  end

  # Tidying this (valid UTF-8) source file must be a no-op.
  def test_should_not_screw_up_valid_strings
    source = File.read(__FILE__)
    assert_equal source.to_s, source.to_utf8_codepoints.tidy_bytes.to_s
  end

end
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8_utils
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Norman Clarke
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-25 00:00:00 -03:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
22
|
+
email: norman@njclarke.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/utf8_utils/version.rb
|
31
|
+
- lib/utf8_utils.rb
|
32
|
+
- README.md
|
33
|
+
- LICENSE
|
34
|
+
- Rakefile
|
35
|
+
- test/utf8_utils_test.rb
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://norman.github.com/utf8_utils
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
segments:
|
50
|
+
- 0
|
51
|
+
version: "0"
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
requirements: []
|
60
|
+
|
61
|
+
rubyforge_project: utf8_utils
|
62
|
+
rubygems_version: 1.3.6
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: Utilities for cleaning up UTF8 strings.
|
66
|
+
test_files:
|
67
|
+
- test/utf8_utils_test.rb
|