utf8_utils 2.0.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -11,7 +11,13 @@ access at [its home on Github](github.com/norman/utf8_utils).
11
11
 
12
12
  ## The Problem
13
13
 
14
- Here's what happens when you try to access a string with invalid UTF-8 characters in Ruby 1.9:
14
+ Your application may have to deal with invalid UTF-8 strings that come from
15
+ user input that is copied and pasted from Microsoft Word, and includes
16
+ Windows-encoded "smart quotes," or other characters. This is only one scenario;
17
+ there are many ways your application could receive such input.
18
+
19
+ Here's what happens when you try to access a string with invalid UTF-8
20
+ characters in Ruby 1.9:
15
21
 
16
22
  ruby-1.9.1-p378 > "my messed up \x92 string".split(//u)
17
23
  ArgumentError: invalid byte sequence in UTF-8
@@ -19,24 +25,30 @@ Here's what happens when you try to access a string with invalid UTF-8 character
19
25
  from (irb):3
20
26
  from /Users/norman/.rvm/rubies/ruby-1.9.1-p378/bin/irb:17:in `<main>'
21
27
 
28
+ Ruby is quite particular about this - accessing the data in the string is
29
+ difficult as almost all string access methods will die with this error.
30
+
22
31
  ## The Solution
23
32
 
33
+ This library breaks the string down into an array of raw bytes, and cleans up
34
+ the ones that are impossible UTF-8 sequences.
35
+
24
36
  ruby-1.9.1-p378 > "my messed up \x92 string".tidy_bytes.split(//u)
25
37
  => ["m", "y", " ", "m", "e", "s", "s", "e", "d", " ", "u", "p", " ", "’", " ", "s", "t", "r", "i", "n", "g"]
26
38
 
27
- Note that like ActiveSupport, it naively assumes if you have invalid UTF8
28
- characters, they are either Windows CP1251 or ISO8859-1. In practice this isn't
29
- a bad assumption, but may not always work.
39
+ Note that, like ActiveSupport, it naively assumes if you have invalid UTF8
40
+ characters, their encoding is either Windows CP1252 or ISO-8859-1. In practice
41
+ this isn't a bad assumption, but may not always work.
30
42
 
31
43
  This library's `tidy_bytes` method is a little less than twice as fast as the
32
44
  one provided by ActiveSupport:
33
45
 
46
+
34
47
  | ACTIVE_SUPPORT | UTF8_UTILS |
35
48
  ----------------------------------------------------------
36
- tidy bytes x20000 | 1.008 | 0.650 |
49
+ tidy bytes x20000 | 1.004 | 0.607 |
37
50
  ==========================================================
38
- Total | 1.008 | 0.650 |
39
-
51
+ Total | 1.004 | 0.607 |
40
52
 
41
53
  ## Getting it
42
54
 
@@ -62,4 +74,4 @@ one provided by ActiveSupport:
62
74
 
63
75
  Created by Norman Clarke.
64
76
 
65
- Copyright (c) 2010, released under the MIT license.
77
+ Copyright (c) 2010, released under the MIT license.
@@ -45,49 +45,55 @@ module UTF8Utils
45
45
  # naively assumes if you have invalid UTF8 bytes, they are either Windows
46
46
  # CP1252 or ISO8859-1. In practice this isn't a bad assumption, but may not
47
47
  # always work.
48
- def tidy_bytes
48
+ #
49
+ # Passing +true+ will forcibly tidy all bytes, assuming that the string's
50
+ # encoding is CP1252 or ISO-8859-1.
51
+ def tidy_bytes(force = false)
49
52
 
50
- bytes = unpack("C*")
51
- continuation_bytes_expected = 0
53
+ if force
54
+ return unpack("C*").map do |b|
55
+ tidy_byte(b)
56
+ end.flatten.compact.pack("C*").unpack("U*").pack("U*")
57
+ end
52
58
 
53
- bytes.each_index do |index|
59
+ bytes = unpack("C*")
60
+ conts_expected = 0
61
+ last_lead = 0
54
62
 
55
- byte = bytes[index]
63
+ bytes.each_index do |i|
56
64
 
57
- is_continuation_byte = byte[7] == 1 && byte[6] == 0
58
- ascii_byte = byte[7] == 0
59
- leading_byte = byte[7] == 1 && byte[6] == 1
65
+ byte = bytes[i]
66
+ is_ascii = byte < 128
67
+ is_cont = byte > 127 && byte < 192
68
+ is_lead = byte > 191 && byte < 245
69
+ is_unused = byte > 240
70
+ is_restricted = byte > 244
60
71
 
61
- if is_continuation_byte
62
- if continuation_bytes_expected > 0
63
- continuation_bytes_expected = continuation_bytes_expected - 1
64
- else
65
- # Not expecting a continuation, so clean it
66
- bytes[index] = tidy_byte(byte)
67
- end
68
- # ASCII byte
69
- elsif ascii_byte
70
- if continuation_bytes_expected > 0
71
- # Expected continuation, got ASCII, so clean previous
72
- bytes[index - 1] = tidy_byte(bytes[index - 1])
73
- continuation_bytes_expected = 0
74
- end
75
- elsif leading_byte
76
- if continuation_bytes_expected > 0
77
- # Expected continuation, got leading, so clean previous
78
- bytes[index - 1] = tidy_byte(bytes[index - 1])
79
- continuation_bytes_expected = 0
72
+ # Impossible or highly unlikely byte? Clean it.
73
+ if is_unused || is_restricted
74
+ bytes[i] = tidy_byte(byte)
75
+ elsif is_cont
76
+ # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
77
+ conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
78
+ else
79
+ if conts_expected > 0
80
+ # Expected continuation, but got ASCII or leading? Clean backwards up to
81
+ # the leading byte.
82
+ (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
83
+ conts_expected = 0
80
84
  end
81
- continuation_bytes_expected =
82
- if byte[5] == 0 then 1
83
- elsif byte[4] == 0 then 2
84
- elsif byte[3] == 0 then 3
85
+ if is_lead
86
+ # Final byte is leading? Clean it.
87
+ if i == bytes.length - 1
88
+ bytes[i] = tidy_byte(bytes.last)
89
+ else
90
+ # Valid leading byte? Expect continuations determined by position of
91
+ # first zero bit, with max of 3.
92
+ conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
93
+ last_lead = i
94
+ end
85
95
  end
86
96
  end
87
- # Don't allow the string to terminate with a leading byte
88
- if leading_byte && index == bytes.length - 1
89
- bytes[index] = tidy_byte(bytes.last)
90
- end
91
97
  end
92
98
  bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
93
99
  end
@@ -100,17 +106,12 @@ module UTF8Utils
100
106
  private
101
107
 
102
108
  def tidy_byte(byte)
103
- if UTF8Utils::CP1252.key? byte
104
- UTF8Utils::CP1252[byte]
105
- elsif byte < 192
106
- [194, byte]
107
- else
108
- [195, byte - 64]
109
- end
109
+ byte < 160 ? UTF8Utils::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
110
110
  end
111
+
111
112
  end
112
113
  end
113
114
 
114
115
  class String
115
116
  include UTF8Utils::StringExt
116
- end
117
+ end
@@ -2,7 +2,7 @@ module UTF8Utils
2
2
  module Version
3
3
  MAJOR = 2
4
4
  MINOR = 0
5
- TINY = 0
5
+ TINY = 1
6
6
  STRING = [MAJOR, MINOR, TINY].join('.')
7
7
  end
8
8
  end
@@ -1,19 +1,66 @@
1
1
  # encoding: utf-8
2
+ require "rubygems"
3
+ require "active_support"
2
4
  require "test/unit"
3
5
  require File.expand_path("../../lib/utf8_utils", __FILE__)
4
6
 
5
7
  class UTF8UtilsTest < Test::Unit::TestCase
6
8
 
7
- CASES = {
8
- "Sim\xF3n Bol\xEDvar" => "Simón Bolívar", # utf-8 leading bytes followed by an ascii char (fix as CP1252)
9
- "\xBFhola?" => "¿hola?", # iso-8859-1 inverted question mark
10
- "\xFF" => "something"
9
+ SINGLE_BYTE_CASES = {
10
+ "\x21" => "!", # Valid ASCII byte, low
11
+ "\x41" => "A", # Valid ASCII byte, mid
12
+ "\x7E" => "~", # Valid ASCII byte, high
13
+ "\x80" => "€", # Continuation byte, low (CP1252)
14
+ "\x94" => "”", # Continuation byte, mid (CP1252)
15
+ "\x9F" => "Ÿ", # Continuation byte, high (CP1252)
16
+ "\xC0" => "À", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
17
+ "\xC1" => "Á", # Overlong encoding, start of 2-byte sequence, but codepoint < 128
18
+ "\xC2" => "Â", # Start of 2-byte sequence, low
19
+ "\xC8" => "È", # Start of 2-byte sequence, mid
20
+ "\xDF" => "ß", # Start of 2-byte sequence, high
21
+ "\xE0" => "à", # Start of 3-byte sequence, low
22
+ "\xE8" => "è", # Start of 3-byte sequence, mid
23
+ "\xEF" => "ï", # Start of 3-byte sequence, high
24
+ "\xF0" => "ð", # Start of 4-byte sequence
25
+ "\xF1" => "ñ", # Unused byte
26
+ "\xFF" => "ÿ", # Restricted byte
11
27
  }
28
+
29
+ def setup
30
+ # SINGLE_BYTE_CASES.each do |k, v|
31
+ # SINGLE_BYTE_CASES[k] = ActiveSupport::Multibyte::Chars.new(k)
32
+ # end
33
+ end
12
34
 
13
- def test_tidy_bytes
14
- CASES.each do |bad, good|
15
- assert_equal good, bad.tidy_bytes
35
+ def test_should_handle_single_byte_cases
36
+ SINGLE_BYTE_CASES.each do |bad, good|
37
+ assert_equal good, bad.tidy_bytes.to_s
38
+ assert_equal "#{good}#{good}", "#{bad}#{bad}".tidy_bytes
39
+ assert_equal "#{good}#{good}#{good}", "#{bad}#{bad}#{bad}".tidy_bytes
40
+ assert_equal "#{good}a", "#{bad}a".tidy_bytes
41
+ assert_equal "a#{good}a", "a#{bad}a".tidy_bytes
42
+ assert_equal "a#{good}", "a#{bad}".tidy_bytes
16
43
  end
17
44
  end
45
+
46
+ def test_should_tidy_leading_byte_followed_by_too_few_continuation_bytes
47
+ string = "\xF0\xA5\xA4\x21"
48
+ assert_equal "ð¥¤!", string.tidy_bytes
49
+ end
50
+
51
+ def test_should_not_modifiy_valid_utf8_unless_forced
52
+ # Nothing can be done to tidy the bytes here, because it's valid UTF-8.
53
+ assert_not_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes
54
+ assert_not_equal "»", "\xC2\xBB".tidy_bytes
55
+ assert_equal "𥤤", "\xF0\xA5\xA4\xA4".tidy_bytes(true)
56
+ assert_equal "»", "\xC2\xBB".tidy_bytes(true)
57
+ end
58
+
59
+ def test_should_not_tidy_leading_byte_followed_by_too_many_continuation_bytes_unless_forced
60
+ string = "\xF0\xA5\xA4\xA4\xA4"
61
+ assert_not_equal "𥤤¤", string.tidy_bytes
62
+ assert_equal "𥤤¤", string.tidy_bytes(true)
63
+ end
64
+
18
65
 
19
66
  end
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 2
7
7
  - 0
8
- - 0
9
- version: 2.0.0
8
+ - 1
9
+ version: 2.0.1
10
10
  platform: ruby
11
11
  authors:
12
12
  - Norman Clarke
@@ -16,19 +16,8 @@ cert_chain: []
16
16
 
17
17
  date: 2010-04-08 00:00:00 -03:00
18
18
  default_executable:
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: mocha
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- requirements:
25
- - - ">="
26
- - !ruby/object:Gem::Version
27
- segments:
28
- - 0
29
- version: "0"
30
- type: :development
31
- version_requirements: *id001
19
+ dependencies: []
20
+
32
21
  description: Utilities for cleaning up UTF8 strings. Compatible with Ruby 1.8.6 - 1.9.x
33
22
  email: norman@njclarke.com
34
23
  executables: []