utf8_utils 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +19 -0
- data/README.md +58 -0
- data/Rakefile +25 -0
- data/lib/utf8_utils/version.rb +8 -0
- data/lib/utf8_utils.rb +156 -0
- data/test/utf8_utils_test.rb +49 -0
- metadata +67 -0
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2010 Norman Clarke
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
# UTF8 Utils
|
2
|
+
|
3
|
+
This library provides a means of cleaning UTF8 strings with invalid characters.
|
4
|
+
|
5
|
+
It provides functionality very similar to [ActiveSupport's `tidy_bytes`
|
6
|
+
method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
|
7
|
+
but works for Ruby 1.8.6 - 1.9.x. Once I sort out any potentially embarrassing
|
8
|
+
issues with it, I'll probably try patching it into ActiveSupport.
|
9
|
+
|
10
|
+
## The Problem
|
11
|
+
|
12
|
+
Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
|
13
|
+
|
14
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
|
15
|
+
ArgumentError: invalid byte sequence in UTF-8
|
16
|
+
from (irb):3:in `split'
|
17
|
+
from (irb):3
|
18
|
+
from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
|
19
|
+
|
20
|
+
## The Solution
|
21
|
+
|
22
|
+
ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_codepoints.tidy_bytes.to_s.split(//u)
|
23
|
+
=> ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
|
24
|
+
|
25
|
+
Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
|
26
|
+
some badly encoded data you need to clean up, it can save you from ripping out
|
27
|
+
your hair.
|
28
|
+
|
29
|
+
Note that like ActiveSupport, it naively assumes if you have invalid UTF8
|
30
|
+
characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
|
31
|
+
a bad assumption, but may not always work.
|
32
|
+
|
33
|
+
## Getting it
|
34
|
+
|
35
|
+
gem install utf8_utils
|
36
|
+
|
37
|
+
|
38
|
+
## Using it
|
39
|
+
|
40
|
+
require "utf8_utils"
|
41
|
+
|
42
|
+
# Traverse codepoints
|
43
|
+
"hello-world".to_utf8_codepoints.each_codepoint do |codepoint|
|
44
|
+
puts codepoint.valid?
|
45
|
+
end
|
46
|
+
|
47
|
+
# tidy bytes
|
48
|
+
good_string = bad_string.to_utf8_codepoints.tidy_bytes.to_s
|
49
|
+
|
50
|
+
## API Docs
|
51
|
+
|
52
|
+
[http://norman.github.com/utf8_utils](http://norman.github.com/utf8_utils)
|
53
|
+
|
54
|
+
## Credits
|
55
|
+
|
56
|
+
Created by Norman Clarke, with some code <strike>stolen</strike> borrowed from ActiveSupport.
|
57
|
+
|
58
|
+
Copyright (c) 2010, released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require "rake"
|
2
|
+
require "rake/testtask"
|
3
|
+
require "rake/gempackagetask"
|
4
|
+
require "rake/rdoctask"
|
5
|
+
require "rake/clean"
|
6
|
+
|
7
|
+
# Generated directories that the clean task should remove.
%w[pkg doc coverage .yardoc].each { |artifact| CLEAN << artifact }

# The gemspec is plain Ruby that evaluates to a Gem::Specification.
# NOTE(review): eval runs whatever is in that file; only use with a trusted checkout.
gemspec = eval(File.read("utf8_utils.gemspec"))
Rake::GemPackageTask.new(gemspec) { |pkg| }

# Run every *_test.rb file found under test/.
Rake::TestTask.new(:test) do |t|
  t.pattern = "test/**/*_test.rb"
end

# Generate RDoc for the library sources into doc/.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = "doc"
  rdoc.rdoc_files.include "lib/**/*.rb"
end

# Coverage task; skipped when rcov cannot be loaded.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |rcov|
    rcov.test_files = FileList["test/**/*_test.rb"]
    rcov.verbose = true
    rcov.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
  # rcov is not available: no coverage task is defined.
end
|
data/lib/utf8_utils.rb
ADDED
@@ -0,0 +1,156 @@
|
|
1
|
+
# Wraps a string as an array of bytes and allows some naive cleanup operations as a workaround
|
2
|
+
# for Ruby 1.9's crappy encoding support that throws exceptions when attempting to access
|
3
|
+
# UTF8 strings with invalid characters.
|
4
|
+
module UTF8Utils

  # Wraps the bytes of a string and walks them one UTF-8 sequence at a time,
  # yielding a Codepoint for each well-formed sequence and a single-byte
  # Codepoint for every byte that is not part of one.
  class Codepoints

    include Enumerable

    # The wrapped bytes, as an Array of Integers.
    attr_accessor :chars

    # Byte offset just past the codepoint most recently yielded by
    # +each_codepoint+. It is 0 whenever no traversal is in progress.
    attr :position

    # Maps the bytes 128..159 to the UTF-8 bytes of the character they stand
    # for in the Windows-1252 code page. Bytes that code page leaves undefined
    # map to nil. The constant name is historical; see the CP1252 alias below.
    CP1251 = {
      128 => [226, 130, 172],
      129 => nil,
      130 => [226, 128, 154],
      131 => [198, 146],
      132 => [226, 128, 158],
      133 => [226, 128, 166],
      134 => [226, 128, 160],
      135 => [226, 128, 161],
      136 => [203, 134],
      137 => [226, 128, 176],
      138 => [197, 160],
      139 => [226, 128, 185],
      140 => [197, 146],
      141 => nil,
      142 => [197, 189],
      143 => nil,
      144 => nil,
      145 => [226, 128, 152],
      146 => [226, 128, 153],
      147 => [226, 128, 156],
      148 => [226, 128, 157],
      149 => [226, 128, 162],
      150 => [226, 128, 147],
      151 => [226, 128, 148],
      152 => [203, 156],
      153 => [226, 132, 162],
      154 => [197, 161],
      155 => [226, 128, 186],
      156 => [197, 147],
      157 => nil,
      158 => [197, 190],
      159 => [197, 184]
    }.freeze

    # Accurately named alias for the table above.
    CP1252 = CP1251

    # Wrap +string+. Its bytes are copied with +unpack+, which never inspects
    # the string's encoding, so invalid UTF-8 cannot raise here and no
    # Ruby-version check is needed.
    def initialize(string)
      @position = 0
      @chars = string.unpack("C*")
    end

    # Attempt to clean up malformed characters. Returns a new Codepoints in
    # which every invalid byte has been replaced by Codepoint#tidy's result.
    def tidy_bytes
      Codepoints.new(map { |codepoint| codepoint.tidy.to_char }.join)
    end

    # Cast to string. Raises ArgumentError (from Codepoint#to_char) when the
    # bytes are not well-formed UTF-8; call +tidy_bytes+ first for dirty data.
    def to_s
      map { |codepoint| codepoint.to_char }.join
    end

    private

    # Yield each codepoint in order. The traversal keeps its own offset, so it
    # always starts at the first byte, and +position+ is reset even when the
    # caller leaves early (+first+, +find+, +break+, an exception).
    def each
      offset = 0
      while offset < chars.length
        codepoint = codepoint_at(offset)
        offset += codepoint.size
        @position = offset
        yield codepoint
      end
      self
    ensure
      @position = 0
    end

    alias :each_codepoint :each
    public :each_codepoint

    # Number of bytes the lead byte at +offset+ announces. Continuation bytes,
    # the never-valid leads (192, 193, 245..255) and a missing byte all count
    # as 1 so that they are examined on their own.
    def bytes_to_pull(offset = position)
      case chars[offset]
      when 194..223 then 2
      when 224..239 then 3
      when 240..244 then 4
      else 1
      end
    end

    # Build the Codepoint starting at +offset+: the full sequence announced by
    # the lead byte when it is well-formed, otherwise just the lead byte.
    def codepoint_at(offset)
      candidate = Codepoint.new(chars.slice(offset, bytes_to_pull(offset)))
      candidate = Codepoint.new(chars.slice(offset, 1)) if candidate.invalid?
      candidate
    end

  end

  # The bytes of one (possibly malformed) UTF-8 sequence.
  class Codepoint < Array

    # Range every continuation byte must fall in.
    TRAILING = (128..191)

    # True when the bytes form exactly one well-formed UTF-8 sequence as
    # defined by RFC 3629: no overlong encodings (lead bytes 192 and 193, or
    # too-small second bytes after 224 and 240), no UTF-16 surrogates (237
    # followed by 160..191) and nothing above U+10FFFF. Always returns true
    # or false.
    def valid?
      case length
      when 1
        (0..127) === self[0]
      when 2
        (194..223) === self[0] && TRAILING === self[1]
      when 3
        second = case self[0]
                 when 224 then (160..191)      # below 160 would be overlong
                 when 237 then (128..159)      # 160 and up are surrogates
                 when 225..239 then TRAILING
                 end
        !second.nil? && second === self[1] && TRAILING === self[2]
      when 4
        second = case self[0]
                 when 240 then (144..191)      # below 144 would be overlong
                 when 241..243 then TRAILING
                 when 244 then (128..143)      # above 143 exceeds U+10FFFF
                 end
        !second.nil? && second === self[1] && TRAILING === self[2] && TRAILING === self[3]
      else
        false
      end
    end

    # Attempt to rescue a valid UTF-8 character from a malformed codepoint.
    # The first byte is converted through the Windows-1252 table when that
    # table defines it; otherwise it is read as ISO-8859-1, which amounts to
    # prepending the matching two-byte lead. Only the first byte is looked at:
    # Codepoints always hands over malformed bytes one at a time.
    # Returns self when already valid (or empty), else a new flat Codepoint.
    def tidy
      return self if valid? || empty?
      byte = self[0]
      mapped = Codepoints::CP1251[byte]
      if mapped
        # Pass the table entry directly so the result is a flat byte list.
        self.class.new(mapped)
      elsif byte < 128
        self.class.new([byte])
      elsif byte < 192
        # Also reached for the bytes Windows-1252 leaves undefined.
        self.class.new([194, byte])
      else
        self.class.new([195, byte - 64])
      end
    end

    # Opposite of +valid?+.
    def invalid?
      !valid?
    end

    # Get a character from the bytes. Decoding goes through +unpack("U*")+,
    # which raises ArgumentError on malformed UTF-8.
    def to_char
      flatten.pack("C*").unpack("U*").pack("U*")
    end

  end
end
|
150
|
+
|
151
|
+
# Get an array of UTF8 codepoints from a string.
|
152
|
+
class String
  # Wrap this string in a UTF8Utils::Codepoints instance.
  def to_utf8_codepoints
    UTF8Utils::Codepoints.new(self)
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require "test/unit"
|
4
|
+
require File.join(File.dirname(__FILE__), "..", "lib", "utf8_utils")
|
5
|
+
|
6
|
+
class UTF8CodepointsTest < Test::Unit::TestCase

  # An ASCII character is a single byte.
  def test_should_pull_one_byte_for_ascii_char
    codepoint = "a".to_utf8_codepoints.entries.first
    assert_equal 1, codepoint.length
  end

  # A Latin-1 supplement character takes two bytes.
  def test_should_pull_two_bytes_for_latin_char_with_diacritics
    codepoint = "¡".to_utf8_codepoints.entries.first
    assert_equal 2, codepoint.length
  end

  # A Devanagari letter takes three bytes.
  def test_should_pull_three_bytes_for_basic_multilingual_char
    codepoint = "आ".to_utf8_codepoints.entries.first
    assert_equal 3, codepoint.length
  end

  # Editors tend to choke on characters outside the basic plane, so the bytes
  # of U+10405 (DESERET CAPITAL LETTER LONG OO) are assigned to the chars
  # field directly instead of appearing as a literal.
  def test_should_pull_four_bytes_for_other_chars
    codepoints = UTF8Utils::Codepoints.new("")
    codepoints.chars = [240, 144, 144, 132]
    assert_equal 4, codepoints.entries.first.length
  end

  # Every codepoint of a well-formed string reports itself as valid.
  def test_should_detect_valid_codepoints
    "cañón आ".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.valid?
    end
  end

  # A lone byte 0x92 is reported as invalid.
  def test_should_detect_invalid_codepoints
    "\x92".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.invalid?
    end
  end

  # An invalid byte between two ASCII characters yields three codepoints.
  def test_should_split_correctly_with_invalid_codepoints
    codepoints = "a\x92a".to_utf8_codepoints.entries
    assert_equal 3, codepoints.length
  end

  # Byte 0x92 is tidied into a right single quotation mark.
  def test_should_tidy_bytes
    tidied = "a\x92a".to_utf8_codepoints.tidy_bytes.to_s
    assert_equal "a’a", tidied
  end

  # Tidying this (valid) test file must give back the same text.
  def test_should_not_screw_up_valid_strings
    source = File.read(__FILE__)
    tidied = source.to_utf8_codepoints.tidy_bytes.to_s
    assert_equal source.to_s, tidied
  end

end
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: utf8_utils
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Norman Clarke
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-03-25 00:00:00 -03:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
|
22
|
+
email: norman@njclarke.com
|
23
|
+
executables: []
|
24
|
+
|
25
|
+
extensions: []
|
26
|
+
|
27
|
+
extra_rdoc_files: []
|
28
|
+
|
29
|
+
files:
|
30
|
+
- lib/utf8_utils/version.rb
|
31
|
+
- lib/utf8_utils.rb
|
32
|
+
- README.md
|
33
|
+
- LICENSE
|
34
|
+
- Rakefile
|
35
|
+
- test/utf8_utils_test.rb
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://norman.github.com/utf8_utils
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
segments:
|
50
|
+
- 0
|
51
|
+
version: "0"
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
requirements: []
|
60
|
+
|
61
|
+
rubyforge_project: utf8_utils
|
62
|
+
rubygems_version: 1.3.6
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: Utilities for cleaning up UTF8 strings.
|
66
|
+
test_files:
|
67
|
+
- test/utf8_utils_test.rb
|