utf8_utils 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2010 Norman Clarke
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # UTF8 Utils
2
+
3
+ This library provides a means of cleaning UTF8 strings with invalid characters.
4
+
5
+ It provides functionality very similar to [ActiveSupport's `tidy_bytes`
6
+ method](http://api.rubyonrails.org/classes/ActiveSupport/Multibyte/Chars.html#M000977),
7
+ but works for Ruby 1.8.6 - 1.9.x. Once I sort out any potentially embarrassing
8
+ issues with it, I'll probably try patching it into ActiveSupport.
9
+
10
+ ## The Problem
11
+
12
+ Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
13
+
14
+ ruby-1.9.1-p378 > "my messed up \x92 string".split(//)
15
+ ArgumentError: invalid byte sequence in UTF-8
16
+ from (irb):3:in `split'
17
+ from (irb):3
18
+ from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
19
+
20
+ ## The Solution
21
+
22
+ ruby-1.9.1-p378 > "my messed up \x92 string".to_utf8_codepoints.tidy_bytes.to_s.split(//u)
23
+ => ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
24
+
25
+ Amazing in its brevity and elegance, huh? Ok, maybe not really but if you have
26
+ some badly encoded data you need to clean up, it can save you from ripping out
27
+ your hair.
28
+
29
+ Note that like ActiveSupport, it naively assumes if you have invalid UTF8
30
+ characters, they are either Windows CP1252 or ISO8859-1. In practice this isn't
31
+ a bad assumption, but may not always work.
32
+
33
+ ## Getting it
34
+
35
+ gem install utf8_utils
36
+
37
+
38
+ ## Using it
39
+
40
+ require "utf8_utils"
41
+
42
+ # Traverse codepoints
43
+ "hello-world".to_utf8_codepoints.each_codepoint do |codepoint|
44
+ puts codepoint.valid?
45
+ end
46
+
47
+ # tidy bytes
48
+ good_string = bad_string.to_utf8_codepoints.tidy_bytes.to_s
49
+
50
+ ## API Docs
51
+
52
+ [http://norman.github.com/utf8_utils](http://norman.github.com/utf8_utils)
53
+
54
+ ## Credits
55
+
56
+ Created by Norman Clarke, with some code <strike>stolen</strike> borrowed from ActiveSupport.
57
+
58
+ Copyright (c) 2010, released under the MIT license.
data/Rakefile ADDED
@@ -0,0 +1,25 @@
1
+ require "rake"
2
+ require "rake/testtask"
3
+ require "rake/gempackagetask"
4
+ require "rake/rdoctask"
5
+ require "rake/clean"
6
+
7
# Build artefacts removed by `rake clean`.
%w[pkg doc coverage .yardoc].each { |artefact| CLEAN << artefact }

# The gemspec is plain Ruby kept in the project root; evaluating it yields the
# Gem::Specification handed to the packaging task.
gem_specification = eval(File.read("utf8_utils.gemspec"))

# NOTE(review): the empty block looks deliberate -- presumably the packaging
# task only defines itself when a block is given; confirm before removing it.
Rake::GemPackageTask.new(gem_specification) { |pkg| }

# `rake test` runs every *_test.rb file found under test/.
Rake::TestTask.new(:test) do |test_task|
  test_task.pattern = "test/**/*_test.rb"
end

# `rake rdoc` writes API documentation for the library sources into doc/.
Rake::RDocTask.new do |rdoc_task|
  rdoc_task.rdoc_dir = "doc"
  rdoc_task.rdoc_files.include "lib/**/*.rb"
end

# Coverage is optional: when rcov cannot be loaded the task is simply not
# defined, and the LoadError is deliberately ignored.
begin
  require "rcov/rcovtask"
  Rcov::RcovTask.new do |rcov_task|
    rcov_task.test_files = FileList["test/**/*_test.rb"]
    rcov_task.verbose = true
    rcov_task.rcov_opts << "--exclude gems/*"
  end
rescue LoadError
end
@@ -0,0 +1,8 @@
1
module UTF8Utils
  # Version number of the library, both as individual components and as the
  # dotted string built from them.
  module Version
    MAJOR, MINOR, TINY = 0, 0, 1
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end
end
data/lib/utf8_utils.rb ADDED
@@ -0,0 +1,156 @@
1
+ # Wraps a string as an array of bytes and allows some naive cleanup operations as a workaround
2
+ # for Ruby 1.9's crappy encoding support that throws exceptions when attempting to access
3
+ # UTF8 strings with invalid characters.
4
module UTF8Utils

  # Wraps the bytes of a string and walks over them one UTF-8 codepoint at a
  # time. Bytes that do not form a well-formed UTF-8 sequence are yielded as
  # single-byte, invalid codepoints so that they can be inspected or repaired
  # with #tidy_bytes.
  class Codepoints

    include Enumerable

    # The wrapped string as an Array of byte values (Integers).
    attr_accessor :chars

    # Index into #chars of the next byte to read while iterating.
    attr_reader :position

    # Replacement UTF-8 byte sequences for bytes 128..159, which ISO-8859-1
    # leaves to control characters. A nil value marks a byte that has no
    # replacement in the table.
    #
    # NOTE: despite its name this is the Windows-1252 (CP1252) mapping (for
    # example 128 maps to the euro sign). The name is kept so existing
    # references to Codepoints::CP1251 keep working.
    CP1251 = {
      128 => [226, 130, 172],
      129 => nil,
      130 => [226, 128, 154],
      131 => [198, 146],
      132 => [226, 128, 158],
      133 => [226, 128, 166],
      134 => [226, 128, 160],
      135 => [226, 128, 161],
      136 => [203, 134],
      137 => [226, 128, 176],
      138 => [197, 160],
      139 => [226, 128, 185],
      140 => [197, 146],
      141 => nil,
      142 => [197, 189],
      143 => nil,
      144 => nil,
      145 => [226, 128, 152],
      146 => [226, 128, 153],
      147 => [226, 128, 156],
      148 => [226, 128, 157],
      149 => [226, 128, 162],
      150 => [226, 128, 147],
      151 => [226, 128, 148],
      152 => [203, 156],
      153 => [226, 132, 162],
      154 => [197, 161],
      155 => [226, 128, 186],
      156 => [197, 147],
      157 => nil,
      158 => [197, 190],
      159 => [197, 184]
    }.freeze

    # Wrap +string+.
    #
    # The bytes are read with unpack("C*"), which looks only at raw bytes: it
    # needs no Ruby-version check and never inspects the string's encoding, so
    # invalid UTF-8 input is accepted.
    def initialize(string)
      @position = 0
      @chars = string.unpack("C*")
    end

    # Attempt to clean up malformed characters.
    #
    # Returns a new Codepoints in which every invalid codepoint has been
    # replaced by Codepoint#tidy's best guess; valid codepoints are unchanged.
    def tidy_bytes
      Codepoints.new(map { |codepoint| codepoint.tidy.to_char }.join)
    end

    # Cast to string.
    #
    # NOTE(review): Codepoint#to_char decodes with unpack("U*"), which is
    # expected to raise ArgumentError on malformed bytes -- call #tidy_bytes
    # first when the input may be invalid; confirm against the Ruby docs.
    def to_s
      map { |codepoint| codepoint.to_char }.join
    end

    private

    # Yield each Codepoint in turn. Returns self.
    #
    # The read position is rewound before the walk and again (via +ensure+)
    # when it ends, so an iteration that stops early -- +break+, +first+,
    # +find+, an exception -- cannot make the next iteration start in the
    # middle of the string.
    def each
      @position = 0
      while (codepoint = next_codepoint)
        yield codepoint
      end
      self
    ensure
      @position = 0
    end

    alias :each_codepoint :each
    public :each_codepoint

    # Number of bytes the byte at #position announces for its sequence.
    # Anything that is not a multi-byte lead byte (ASCII, a stray continuation
    # byte, 248..255, or nil past the end of the data) counts as one byte.
    def bytes_to_pull
      case chars[position]
      when 192..223 then 2
      when 224..239 then 3
      when 240..247 then 4
      else 1
      end
    end

    # Read the codepoint starting at #position and advance past it. Returns
    # nil once the bytes are exhausted.
    def next_codepoint
      codepoint = Codepoint.new(chars.slice(position, bytes_to_pull) || [])
      # A malformed sequence is reduced to its first byte so that the bytes
      # following it get a chance to start a sequence of their own.
      if codepoint.invalid?
        codepoint = Codepoint.new(chars.slice(position, 1) || [])
      end
      @position = position + codepoint.size
      codepoint unless codepoint.empty?
    end

  end

  # The bytes (Integers) of a single UTF-8 encoded character, or a single
  # stray byte when the input was malformed.
  class Codepoint < Array

    # Value range of a UTF-8 continuation byte.
    CONTINUATION = (128..191)

    # Whether the bytes form exactly one well-formed UTF-8 sequence.
    #
    # Borrowed from the regexp in ActiveSupport, which in turn had been
    # borrowed from the Kconv library by Shinji KONO - (also as seen on the
    # W3C site). See also http://en.wikipedia.org/wiki/UTF-8
    #
    # Overlong two-byte forms (lead bytes 192 and 193) and encoded UTF-16
    # surrogates (237 followed by 160..191) are rejected, as is any length
    # other than 1 to 4 bytes.
    def valid?
      b0, b1, b2, b3 = self
      case length
      when 1
        (0..127) === b0
      when 2
        (194..223) === b0 && CONTINUATION === b1
      when 3
        CONTINUATION === b2 &&
          ((b0 == 224 && (160..191) === b1) ||
           ((225..236) === b0 && CONTINUATION === b1) ||
           (b0 == 237 && (128..159) === b1) ||
           ((238..239) === b0 && CONTINUATION === b1))
      when 4
        CONTINUATION === b2 && CONTINUATION === b3 &&
          ((b0 == 240 && (144..191) === b1) ||
           ((241..243) === b0 && CONTINUATION === b1) ||
           (b0 == 244 && (128..143) === b1))
      else
        false
      end
    end

    # Attempt to rescue a valid UTF-8 character from a malformed codepoint.
    #
    # Only the first byte is considered (Codepoints hands out malformed input
    # one byte at a time). A byte with an entry in Codepoints::CP1251 is
    # replaced by that entry; any other byte is read as ISO-8859-1 and
    # re-encoded as the two-byte UTF-8 form of the same code point. Bytes
    # whose table entry is nil take the ISO-8859-1 route as well.
    #
    # Returns self when already valid (or empty), otherwise a new Codepoint.
    def tidy
      return self if valid? || empty?
      byte = self[0]
      if (replacement = Codepoints::CP1251[byte])
        self.class.new(replacement)
      elsif byte < 128
        self.class.new([byte])
      elsif byte < 192
        self.class.new([194, byte])
      else
        self.class.new([195, byte - 64])
      end
    end

    # Inverse of #valid?.
    def invalid?
      !valid?
    end

    # Get a character from the bytes: they are decoded as UTF-8 and packed
    # again as a UTF-8 string.
    def to_char
      flatten.pack("C*").unpack("U*").pack("U*")
    end

  end
end
150
+
151
+ # Get an array of UTF8 codepoints from a string.
152
class String
  # Wrap this string in a UTF8Utils::Codepoints, which exposes its bytes as a
  # sequence of UTF-8 codepoints.
  def to_utf8_codepoints
    UTF8Utils::Codepoints.new(self)
  end
end
@@ -0,0 +1,49 @@
1
+ # encoding: utf-8
2
+
3
+ require "test/unit"
4
+ require File.join(File.dirname(__FILE__), "..", "lib", "utf8_utils")
5
+
6
# Exercises codepoint splitting, validity detection and byte tidying.
class UTF8CodepointsTest < Test::Unit::TestCase

  def test_should_pull_one_byte_for_ascii_char
    first_codepoint = "a".to_utf8_codepoints.to_a.first
    assert_equal 1, first_codepoint.length
  end

  def test_should_pull_two_bytes_for_latin_char_with_diacritics
    first_codepoint = "¡".to_utf8_codepoints.to_a.first
    assert_equal 2, first_codepoint.length
  end

  def test_should_pull_three_bytes_for_basic_multilingual_char
    first_codepoint = "आ".to_utf8_codepoints.to_a.first
    assert_equal 3, first_codepoint.length
  end

  def test_should_pull_four_bytes_for_other_chars
    # Characters outside the basic multilingual plane upset many editors, so
    # the bytes are injected directly instead of being written as a literal.
    # They are the UTF-8 encoding of U+10404, a Deseret capital letter.
    codepoints = UTF8Utils::Codepoints.new("")
    codepoints.chars = [240, 144, 144, 132]
    assert_equal 4, codepoints.to_a.first.length
  end

  def test_should_detect_valid_codepoints
    "cañón आ".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.valid?
    end
  end

  def test_should_detect_invalid_codepoints
    "\x92".to_utf8_codepoints.each_codepoint do |codepoint|
      assert codepoint.invalid?
    end
  end

  def test_should_split_correctly_with_invalid_codepoints
    pieces = "a\x92a".to_utf8_codepoints.to_a
    assert_equal 3, pieces.length
  end

  def test_should_tidy_bytes
    tidied = "a\x92a".to_utf8_codepoints.tidy_bytes.to_s
    assert_equal "a’a", tidied
  end

  def test_should_not_screw_up_valid_strings
    # This file is itself valid UTF-8, so tidying must be a no-op on it.
    source = File.read(__FILE__)
    tidied = source.to_utf8_codepoints.tidy_bytes.to_s
    assert_equal source, tidied
  end

end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: utf8_utils
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Norman Clarke
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-03-25 00:00:00 -03:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
22
+ email: norman@njclarke.com
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - lib/utf8_utils/version.rb
31
+ - lib/utf8_utils.rb
32
+ - README.md
33
+ - LICENSE
34
+ - Rakefile
35
+ - test/utf8_utils_test.rb
36
+ has_rdoc: true
37
+ homepage: http://norman.github.com/utf8_utils
38
+ licenses: []
39
+
40
+ post_install_message:
41
+ rdoc_options: []
42
+
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ segments:
50
+ - 0
51
+ version: "0"
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ requirements: []
60
+
61
+ rubyforge_project: utf8_utils
62
+ rubygems_version: 1.3.6
63
+ signing_key:
64
+ specification_version: 3
65
+ summary: Utilities for cleaning up UTF8 strings.
66
+ test_files:
67
+ - test/utf8_utils_test.rb