utf8_utils 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2010 Norman Clarke
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # UTF8 Utils
2
+
3
+ This library provides a means of cleaning UTF8 strings with invalid characters.
4
+
5
+ It provides functionality very similar to [ActiveSupport's `tidy_bytes`
6
+ method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
7
+ but works for Ruby 1.8.6 - 1.9.x. Once I sort out any potentially embarrassing
8
+ issues with it, I'll probably try patching it into ActiveSupport.
9
+
10
+ ## The Problem
11
+
12
+ Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
13
+
14
+ ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
15
+ ArgumentError: invalid byte sequence in UTF-8
16
+ from (irb):3:in `split'
17
+ from (irb):3
18
+ from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
19
+
20
+ ## The Solution
21
+
22
+ ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_codepoints.tidy_bytes.to_s.split(//u)
23
+ => ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
24
+
25
+ Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
26
+ some badly encoded data you need to clean up, it can save you from ripping out
27
+ your hair.
28
+
29
+ Note that like ActiveSupport, it naively assumes if you have invalid UTF8
30
+ characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
31
+ a bad assumption, but may not always work.
32
+
33
+ ## Getting it
34
+
35
+ gem install utf8_utils
36
+
37
+
38
+ ## Using it
39
+
40
+ require "utf8_utils"
41
+
42
+ # Traverse codepoints
43
+ "hello-world".to_utf8_codepoints.each_codepoint do |codepoint|
44
+ puts codepoint.valid?
45
+ end
46
+
47
+ # tidy bytes
48
+ good_string = bad_string.to_utf8_codepoints.tidy_bytes.to_s
49
+
50
+ ## API Docs
51
+
52
+ [http://norman.github.com/utf8_utils](http://norman.github.com/utf8_utils)
53
+
54
+ ## Credits
55
+
56
+ Created by Norman Clarke, with some code <strike>stolen</strike> borrowed from ActiveSupport.
57
+
58
+ Copyright (c) 2010, released under the MIT license.
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ require "rake"
2
+ require "rake/testtask"
3
+ require "rake/gempackagetask"
4
+ require "rake/rdoctask"
5
+ require "rake/clean"
6
+
7
# Directories removed by `rake clean`.
%w[pkg doc coverage .yardoc].each { |path| CLEAN << path }

# Gem packaging tasks, driven by the gemspec that sits next to this Rakefile.
gemspec = eval(File.read("utf8_utils.gemspec"))
Rake::GemPackageTask.new(gemspec) { |pkg| }

# `rake test` runs every *_test.rb file below test/.
Rake::TestTask.new(:test) do |t|
  t.pattern = "test/**/*_test.rb"
end

# RDoc output for everything under lib/ is written to doc/.
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = "doc"
  rdoc.rdoc_files.include("lib/**/*.rb")
end

# Coverage task is optional: it is only defined when rcov can be loaded.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |rcov|
    rcov.test_files = FileList["test/**/*_test.rb"]
    rcov.verbose = true
    rcov.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
  # rcov is not available; carry on without a coverage task.
end
@@ -0,0 +1,8 @@
1
module UTF8Utils
  # Version number of the library, split into its components.
  module Version
    MAJOR = 0
    MINOR = 0
    TINY  = 1
    # Dotted version string built from the components above. Frozen so the
    # constant cannot be modified in place by callers.
    STRING = [MAJOR, MINOR, TINY].join(".").freeze
  end
end
data/lib/utf8_utils.rb ADDED
@@ -0,0 +1,156 @@
1
# Wraps a string as an array of bytes and allows some naive cleanup operations
# as a workaround for Ruby 1.9 raising exceptions when it is asked to work with
# UTF-8 strings that contain invalid byte sequences.
module UTF8Utils

  # A string held as an array of byte values that can be enumerated one UTF-8
  # codepoint (a Codepoint, i.e. a small array of bytes) at a time.
  class Codepoints

    include Enumerable

    # The bytes of the wrapped string, as Integers.
    attr_accessor :chars

    # Index into +chars+ of the next byte to read while iterating. It is 0
    # whenever no iteration is in progress.
    attr_reader :position

    # UTF-8 byte sequences for the Windows-1252 interpretation of the bytes
    # 128..159. Bytes that Windows-1252 leaves unassigned map to nil.
    CP1252 = {
      128 => [226, 130, 172],
      129 => nil,
      130 => [226, 128, 154],
      131 => [198, 146],
      132 => [226, 128, 158],
      133 => [226, 128, 166],
      134 => [226, 128, 160],
      135 => [226, 128, 161],
      136 => [203, 134],
      137 => [226, 128, 176],
      138 => [197, 160],
      139 => [226, 128, 185],
      140 => [197, 146],
      141 => nil,
      142 => [197, 189],
      143 => nil,
      144 => nil,
      145 => [226, 128, 152],
      146 => [226, 128, 153],
      147 => [226, 128, 156],
      148 => [226, 128, 157],
      149 => [226, 128, 162],
      150 => [226, 128, 147],
      151 => [226, 128, 148],
      152 => [203, 156],
      153 => [226, 132, 162],
      154 => [197, 161],
      155 => [226, 128, 186],
      156 => [197, 147],
      157 => nil,
      158 => [197, 190],
      159 => [197, 184]
    }.freeze

    # Original name of the table, kept so existing references keep working.
    # The data is Windows-1252; "CP1251" was a misnomer.
    CP1251 = CP1252

    # string - the String to wrap. Its bytes are read with unpack("C*"), which
    # works on the raw bytes and therefore does not raise when the string is
    # not valid UTF-8.
    def initialize(string)
      @position = 0
      @chars = string.unpack("C*")
    end

    # Attempt to clean up malformed characters.
    #
    # Returns a new Codepoints built from the tidied version of every
    # codepoint. Bytes that have no replacement (see Codepoint#tidy) are
    # dropped.
    def tidy_bytes
      Codepoints.new(entries.map { |c| c.tidy.to_char }.compact.join)
    end

    # Cast to string.
    #
    # Raises whatever Codepoint#to_char raises when a codepoint is not valid
    # UTF-8; call tidy_bytes first if the input may be malformed.
    def to_s
      entries.map { |e| e.to_char }.join
    end

    private

    # Yields each Codepoint in order, starting from the first byte.
    #
    # The read position is reset in an ensure clause so that leaving the block
    # early (break, Enumerable#first, an exception, ...) cannot leave the next
    # traversal starting in the middle of the string.
    def each
      @position = 0
      while codepoint = next_codepoint
        yield codepoint
      end
      self
    ensure
      @position = 0
    end

    alias :each_codepoint :each
    public :each_codepoint

    # Number of bytes the lead byte at the current position announces. Anything
    # that is not a recognised 1-, 2- or 3-byte lead (including nil at the end
    # of input) is treated as the start of a 4-byte sequence; next_codepoint
    # falls back to a single byte if that guess turns out to be invalid.
    def bytes_to_pull
      case chars[position]
      when 0..127   then 1
      when 128..223 then 2
      when 224..239 then 3
      else 4
      end
    end

    # Reads the codepoint at the current position and advances past it.
    # When the announced multi-byte sequence is not valid, a single (invalid)
    # byte is returned instead so that the caller can tidy it on its own.
    #
    # Returns a Codepoint, or nil once all bytes have been consumed.
    def next_codepoint
      codepoint = Codepoint.new(chars.slice(position, bytes_to_pull))
      if codepoint.invalid?
        codepoint = Codepoint.new(chars.slice(position, 1))
      end
      @position = position + codepoint.size
      codepoint unless codepoint.empty?
    end

  end

  # The bytes of one (possibly malformed) UTF-8 encoded character.
  class Codepoint < Array

    # Whether the bytes form exactly one well-formed UTF-8 sequence.
    #
    # Originally borrowed from the regexp in ActiveSupport, which in turn had
    # been borrowed from the Kconv library by Shinji KONO - (also as seen on
    # the W3C site). Tightened here to also reject the overlong lead bytes
    # 192/193 and the UTF-16 surrogate range (237 followed by 160..191).
    # See also http://en.wikipedia.org/wiki/UTF-8
    #
    # Returns true or false.
    def valid?
      case length
      when 1 then (0..127).include?(self[0])
      when 2 then (194..223).include?(self[0]) && continuation?(self[1])
      when 3 then valid_three_bytes?
      when 4 then valid_four_bytes?
      else false
      end
    end

    # Attempt to rescue a valid UTF-8 character from a malformed codepoint.
    # Only the first byte is looked at. Bytes 128..159 are converted from
    # Windows-1252 (unassigned ones yield an empty Codepoint); any other high
    # byte is treated as ISO-8859-1 and re-encoded as a two-byte sequence.
    # Note that much of the logic here is taken from ActiveSupport; the
    # difference is that this works for Ruby 1.8.6 - 1.9.1.
    #
    # Returns self when already valid (or empty), otherwise a new Codepoint.
    def tidy
      return self if valid? || empty?
      byte = self[0]
      if Codepoints::CP1251.key?(byte)
        # nil marks a byte with no Windows-1252 meaning: nothing to rescue.
        self.class.new(Codepoints::CP1251[byte] || [])
      elsif byte < 128
        # An ASCII byte followed by stray bytes: keep the ASCII character.
        self.class.new([byte])
      elsif byte < 192
        self.class.new([194, byte])
      else
        self.class.new([195, byte - 64])
      end
    end

    # Negation of valid?.
    def invalid?
      !valid?
    end

    # Get a character from the bytes.
    #
    # Returns a String; it is empty for an empty Codepoint. On Ruby 1.9 and
    # later, unpack("U*") raises ArgumentError when the bytes are not valid
    # UTF-8, so tidy the codepoint first if it may be malformed.
    def to_char
      flatten.pack("C*").unpack("U*").pack("U*")
    end

    private

    # Whether byte is a UTF-8 continuation byte (binary 10xxxxxx).
    def continuation?(byte)
      (128..191).include?(byte)
    end

    # Validity of a three-byte sequence. The range allowed for the second byte
    # depends on the lead byte so that overlong forms and surrogates fail.
    def valid_three_bytes?
      second = case self[0]
               when 224      then 160..191
               when 237      then 128..159
               when 225..239 then 128..191
               else return false
               end
      second.include?(self[1]) && continuation?(self[2])
    end

    # Validity of a four-byte sequence. The range allowed for the second byte
    # depends on the lead byte so that overlong forms and values above
    # U+10FFFF fail.
    def valid_four_bytes?
      second = case self[0]
               when 240      then 144..191
               when 241..243 then 128..191
               when 244      then 128..143
               else return false
               end
      second.include?(self[1]) && continuation?(self[2]) && continuation?(self[3])
    end

  end
end
150
+
151
# Core extension: lets any String be wrapped for codepoint-level access.
class String
  # Returns a UTF8Utils::Codepoints built from this string.
  def to_utf8_codepoints
    UTF8Utils::Codepoints.new(self)
  end
end
@@ -0,0 +1,49 @@
1
+ # encoding: utf-8
2
+
3
+ require "test/unit"
4
+ require File.join(File.dirname(__FILE__), "..", "lib", "utf8_utils")
5
+
6
# Covers how Codepoints groups bytes, detects validity and tidies bad bytes.
class UTF8CodepointsTest < Test::Unit::TestCase

  def test_should_pull_one_byte_for_ascii_char
    first = "a".to_utf8_codepoints.entries[0]
    assert_equal 1, first.length
  end

  def test_should_pull_two_bytes_for_latin_char_with_diacritics
    first = "¡".to_utf8_codepoints.entries[0]
    assert_equal 2, first.length
  end

  def test_should_pull_three_bytes_for_basic_multilingual_char
    first = "आ".to_utf8_codepoints.entries[0]
    assert_equal 3, first.length
  end

  def test_should_pull_four_bytes_for_other_chars
    codepoints = UTF8Utils::Codepoints.new("")
    # Editors tend to struggle with characters outside the BMP, so the bytes
    # are assigned directly rather than written as a literal.
    codepoints.chars = [240, 144, 144, 132]
    assert_equal 4, codepoints.entries[0].length
  end

  def test_should_detect_valid_codepoints
    "cañón आ".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.valid?
    end
  end

  def test_should_detect_invalid_codepoints
    "\x92".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.invalid?
    end
  end

  def test_should_split_correctly_with_invalid_codepoints
    pieces = "a\x92a".to_utf8_codepoints.entries
    assert_equal 3, pieces.length
  end

  def test_should_tidy_bytes
    tidied = "a\x92a".to_utf8_codepoints.tidy_bytes
    assert_equal "a’a", tidied.to_s
  end

  def test_should_not_screw_up_valid_strings
    # This file is valid UTF-8, so a round trip must leave it untouched.
    source = File.read(__FILE__)
    round_tripped = source.to_utf8_codepoints.tidy_bytes.to_s
    assert_equal source.to_s, round_tripped
  end

end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8_utils
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Norman Clarke
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-25 00:00:00 -03:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
22
+ email: norman@njclarke.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/utf8_utils/version.rb
31
+ - lib/utf8_utils.rb
32
+ - README.md
33
+ - LICENSE
34
+ - Rakefile
35
+ - test/utf8_utils_test.rb
36
+ has_rdoc: true
37
+ homepage: http://norman.github.com/utf8_utils
38
+ licenses: []
39
+
40
+ post_install_message:
41
+ rdoc_options: []
42
+
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ segments:
50
+ - 0
51
+ version: "0"
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ requirements: []
60
+
61
+ rubyforge_project: utf8_utils
62
+ rubygems_version: 1.3.6
63
+ signing_key:
64
+ specification_version: 3
65
+ summary: Utilities for cleaning up UTF8 strings.
66
+ test_files:
67
+ - test/utf8_utils_test.rb