invoca-utils 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +5 -0
- data/Gemfile +11 -1
- data/Gemfile.lock +2 -2
- data/invoca-utils.gemspec +0 -11
- data/lib/invoca/utils.rb +8 -0
- data/lib/invoca/utils/diff.rb +290 -282
- data/lib/invoca/utils/guaranteed_utf8_string.rb +106 -45
- data/lib/invoca/utils/version.rb +1 -1
- data/test/helpers/constant_overrides.rb +44 -0
- data/test/test_helper.rb +3 -0
- data/test/unit/guaranteed_utf8_string_test.rb +191 -67
- data/test/unit/time_calculations_test.rb +0 -1
- data/test/unit/utils_test.rb +45 -2
- metadata +6 -129
@@ -1,68 +1,129 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# This class
|
3
|
+
# This class provides a normalize_string method that guarantees that its result is in valid UTF-8
|
4
|
+
# format for Ruby and all versions of MySQL (using mb3 storage).
|
5
|
+
#
|
6
|
+
# [Deprecated] Equivalently, you can also create an instance of this class and call to_string or to_s on it.
|
4
7
|
module Invoca
  module Utils
    # Normalizes string-like input into valid UTF-8.
    #
    # The primary entry point is the class method +normalize_string+. Creating an instance and calling
    # +to_string+ / +to_s+ on it returns the same result computed with the default options.
    class GuaranteedUTF8String
      # The normalized string computed by the constructor.
      attr_reader :to_string

      # @param string [String, #to_s] value to normalize with the default +normalize_string+ options
      # @raise [ArgumentError] under the same conditions as +normalize_string+
      def initialize(string)
        @to_string = self.class.normalize_string(string)
      end

      alias to_s to_string

      # chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON
      REPLACE_CHARACTER = '~'

      class << self
        # Returns a new UTF-8 string built from +orig_string+; the argument itself is never mutated
        # (a duplicate is normalized in place).
        #
        # @param orig_string [String, #to_s] a String, or any object whose +to_s+ is not the one inherited from Kernel
        # @param normalize_utf16 [Boolean] detect a leading UTF-16 LE/BE byte order mark and transcode from UTF-16
        # @param normalize_cp1252 [Boolean] treat input that is not valid UTF-8 as CP1252 (and map C1 code points
        #   found in UTF-16 input through CP1252); when false, invalid UTF-8 raises ArgumentError
        # @param normalize_newlines [Boolean] rewrite "\r\n" and "\r" as "\n"
        # @param remove_utf8_bom [Boolean] strip a leading UTF-8 byte order mark
        # @param replace_unicode_beyond_ffff [Boolean] replace code points above U+FFFF with REPLACE_CHARACTER
        # @return [String] the normalized string, encoded as UTF-8
        # @raise [ArgumentError] if +orig_string+ is not string-like, or the bytes are not valid UTF-8 and
        #   +normalize_cp1252+ is false
        def normalize_string(orig_string,
                             normalize_utf16: true,
                             normalize_cp1252: true,
                             normalize_newlines: true,
                             remove_utf8_bom: true,
                             replace_unicode_beyond_ffff: true)
          string = if orig_string.is_a?(String) ||
                      (orig_string.respond_to?(:to_s) &&
                       orig_string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
                     orig_string.to_s.dup
                   else
                     raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
                   end

          # Only relabels the bytes as UTF-8; validity is checked (and repaired) further down.
          string.force_encoding('UTF-8')
          normalize_string_from_utf8(string,
                                     normalize_utf16: normalize_utf16,
                                     normalize_cp1252: normalize_cp1252,
                                     normalize_newlines: normalize_newlines,
                                     remove_utf8_bom: remove_utf8_bom,
                                     replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
        end

        private

        # Runs the enabled normalization steps, in order, on +string+ (mutated in place) and returns it.
        # Expects +string+ to already be labelled as UTF-8.
        def normalize_string_from_utf8(string,
                                       normalize_utf16:,
                                       normalize_cp1252:,
                                       normalize_newlines:,
                                       remove_utf8_bom:,
                                       replace_unicode_beyond_ffff:)
          found_utf_16 = normalize_utf_16(string, normalize_cp1252: normalize_cp1252) if normalize_utf16

          if found_utf_16
            # normalize_utf_16 leaves the string labelled as UTF-16; transcode it now.
            string.encode!('UTF-8')
          elsif !string.valid_encoding?
            unless normalize_cp1252
              raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
            end

            cp1252_to_utf_8(string)
          end

          normalize_newlines(string) if normalize_newlines
          remove_utf8_bom(string) if remove_utf8_bom
          replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff
          string
        end

        UTF_16_LE_BOM = "\xFF\xFE"
        UTF_16_BE_BOM = "\xFE\xFF"
        UTF_8_BOM     = "\xEF\xBB\xBF"

        # Compiled once here rather than on every remove_utf8_bom call.
        UTF_8_BOM_PATTERN = /\A #{UTF_8_BOM}/x

        PRIVATE_CP1252_CHAR_PATTERN = "[\u0080-\u009f]"
        PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16LE'))
        PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16BE'))

        # returns truthy iff UTF_16 was found, in which case it has been normalized but the string is still UTF-16
        # otherwise returns falsey and leaves the string as is
        def normalize_utf_16(string, normalize_cp1252:)
          case string[0, 2]
          when UTF_16_LE_BOM
            string.slice!(0, 2) # remove the BOM
            string.force_encoding('UTF-16LE')
            normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE, 'UTF-16LE') if normalize_cp1252
            true
          when UTF_16_BE_BOM
            string.slice!(0, 2) # remove the BOM
            string.force_encoding('UTF-16BE')
            normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE, 'UTF-16BE') if normalize_cp1252
            true
          end
        end

        # Rewrites, in place, every character of +string+ matched by +pattern+ (U+0080..U+009F) with the
        # character that the same byte value denotes in CP1252, expressed in +encoding+.
        #
        # Byte values that CP1252 leaves undefined become REPLACE_CHARACTER, matching cp1252_to_utf_8,
        # instead of raising Encoding::UndefinedConversionError.
        def normalize_multibyte_cp1252(string, pattern, encoding)
          string.gsub!(pattern) do |char|
            char.ord.chr.force_encoding('CP1252')
                .encode('UTF-8', replace: REPLACE_CHARACTER, undef: :replace, invalid: :replace)
                .encode(encoding)
          end
        end

        # Rewrites "\r\n" and lone "\r" as "\n", in place.
        def normalize_newlines(string)
          string.gsub!(/ \r\n | \r | \n /x, "\n")
        end

        # Reinterprets the bytes of +string+ as CP1252 and transcodes them to UTF-8 in place;
        # bytes with no CP1252 mapping become REPLACE_CHARACTER.
        def cp1252_to_utf_8(string)
          string.force_encoding('CP1252')
          string.encode!(
            'UTF-8',
            replace: REPLACE_CHARACTER,
            undef:   :replace,
            invalid: :replace
          )
        end

        # Strips a leading UTF-8 byte order mark, in place.
        def remove_utf8_bom(string)
          string.sub!(UTF_8_BOM_PATTERN, '')
        end

        # Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
        # is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
        # it would take a data migration and didn't seem that important.

        # Replaces every code point above U+FFFF with REPLACE_CHARACTER, in place.
        def replace_unicode_beyond_ffff(string)
          string.gsub!(/[^\u0000-\uffff]/x, REPLACE_CHARACTER)
        end
      end
    end
  end
end
|
68
|
-
|
data/lib/invoca/utils/version.rb
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The same as https://github.com/Invoca/test_overrides/blob/master/lib/constant_overrides.rb,
|
4
|
+
# but less coupled to ExceptionHandling and Invoca microservices.
|
5
|
+
|
6
|
+
# Test mixin for temporarily replacing (or defining) constants and restoring them afterwards.
#
# Call +set_test_const+ during a test and +cleanup_constant_overrides+ when it finishes.
# Both call +silence_warnings+, which this module does not define; the including class must
# provide it (presumably ActiveSupport's Kernel#silence_warnings -- confirm against test_helper).
module ConstantOverrides
  # Starts a fresh, empty record of overrides.
  def setup_constant_overrides
    @constant_overrides = []
  end

  # Undoes every recorded override, most recent first, then forgets them so a repeated call is a no-op.
  # Constants that did not exist before the override are removed; the others get their prior value back.
  # Safe to call even if no override was ever recorded.
  def cleanup_constant_overrides
    (@constant_overrides || []).reverse_each do |parent_module, const_name, original_value|
      silence_warnings do
        if original_value.equal?(:never_defined)
          parent_module.send(:remove_const, const_name)
        else
          parent_module.const_set(const_name, original_value)
        end
      end
    end
    @constant_overrides = []
  end

  # Sets the constant named +const_name+ to +value+ and records its previous state for cleanup.
  #
  # @param const_name [String, Symbol] constant name, optionally namespaced with '::' (resolved from Object)
  # @param value [Object] the temporary value
  # @raise [RuntimeError] if +const_name+ is neither a String nor a Symbol, or if a parent namespace
  #   in the path is not defined
  def set_test_const(const_name, value)
    const_name = const_name.to_s if const_name.is_a?(Symbol)
    raise "Pass the constant name, not its value!" unless const_name.is_a?(String)

    final_parent_module = final_const_name = nil
    # Walk the path one segment at a time; the last lookup yields the current value (or the
    # :never_defined marker), while the two locals remember where the final constant lives.
    original_value =
      const_name.split('::').reduce(Object) do |parent_module, nested_const_name|
        if parent_module == :never_defined
          raise "You need to set each parent constant earlier! #{nested_const_name}"
        end

        final_parent_module = parent_module
        final_const_name    = nested_const_name
        begin
          parent_module.const_get(nested_const_name)
        rescue NameError # only "no such constant" means never defined; other errors should surface
          :never_defined
        end
      end

    (@constant_overrides ||= []) << [final_parent_module, final_const_name, original_value]

    silence_warnings { final_parent_module.const_set(final_const_name, value) }
  end
end
|
data/test/test_helper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../../lib/invoca/utils/guaranteed_utf8_string'
|
3
4
|
require_relative '../test_helper'
|
4
5
|
|
5
6
|
class GuaranteedUTF8StringTest < Minitest::Test
|
@@ -18,99 +19,222 @@ class GuaranteedUTF8StringTest < Minitest::Test
|
|
18
19
|
end
|
19
20
|
end
|
20
21
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
context Invoca::Utils::GuaranteedUTF8String do
|
23
|
+
context '.normalize_string' do
|
24
|
+
should 'raise an error if called with an object with no to_s method' do
|
25
|
+
ex = assert_raises ArgumentError do
|
26
|
+
Invoca::Utils::GuaranteedUTF8String.normalize_string(HasNoTo_sMethod.new)
|
27
|
+
end
|
26
28
|
|
27
|
-
|
28
|
-
|
29
|
-
Invoca::Utils::GuaranteedUTF8String.new(BasicObjectWithKernelMethods.new)
|
30
|
-
end
|
31
|
-
end
|
29
|
+
assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/, ex.message)
|
30
|
+
end
|
32
31
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
should 'raise an error if called with a basic Ruby object' do
|
33
|
+
ex = assert_raises ArgumentError do
|
34
|
+
Invoca::Utils::GuaranteedUTF8String.normalize_string(BasicObjectWithKernelMethods.new)
|
35
|
+
end
|
37
36
|
|
38
|
-
|
39
|
-
|
40
|
-
ascii_string = "new string".encode("ASCII")
|
41
|
-
utf8_string_instance = Invoca::Utils::GuaranteedUTF8String.new(ascii_string)
|
42
|
-
encoded_string = utf8_string_instance.to_string
|
37
|
+
assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/, ex.message)
|
38
|
+
end
|
43
39
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
40
|
+
should 'not mutate the original string' do
|
41
|
+
ascii_string = 'new string'.encode('ASCII')
|
42
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(ascii_string)
|
48
43
|
|
49
|
-
|
50
|
-
|
44
|
+
assert_equal ascii_string, encoded_string
|
45
|
+
assert_equal Encoding::ASCII, ascii_string.encoding
|
46
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
47
|
+
end
|
51
48
|
|
52
|
-
|
49
|
+
should 'return UTF-8 encoded string' do
|
50
|
+
original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
|
53
51
|
|
54
|
-
|
55
|
-
assert_equal Encoding::UTF_8, encoded_string.encoding
|
56
|
-
end
|
52
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string)
|
57
53
|
|
58
|
-
|
59
|
-
|
54
|
+
assert_equal original_string, encoded_string
|
55
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
56
|
+
end
|
60
57
|
|
61
|
-
|
58
|
+
context "normalize_utf16" do
|
59
|
+
UTF16_LE_BOM = "\xFF\xFE"
|
60
|
+
UTF16_BE_BOM = "\xFE\xFF"
|
61
|
+
UTF16_LE_TEST_STRING = (UTF16_LE_BOM + "v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY').freeze
|
62
|
+
UTF16_BE_TEST_STRING = (UTF16_BE_BOM + "\x00v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY').freeze
|
62
63
|
|
63
|
-
|
64
|
-
|
65
|
-
end
|
64
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string when true' do
|
65
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: true)
|
66
66
|
|
67
|
-
|
68
|
-
|
67
|
+
assert_equal "valid,utf-16\nsecond", encoded_string
|
68
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
69
|
+
end
|
69
70
|
|
70
|
-
|
71
|
+
should 'not check for UTF-16LE in BINARY and return UTF-8 encoded string when false' do
|
72
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: false)
|
73
|
+
expected = "ÿþv\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d\u0000"
|
74
|
+
assert_equal expected, encoded_string
|
75
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
76
|
+
end
|
71
77
|
|
72
|
-
|
73
|
-
|
74
|
-
end
|
78
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when true' do
|
79
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: true)
|
75
80
|
|
76
|
-
|
77
|
-
|
78
|
-
|
81
|
+
assert_equal "valid,utf-16\nsecond", encoded_string
|
82
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
83
|
+
end
|
79
84
|
|
80
|
-
|
85
|
+
should 'not check for UTF-16BE in BINARY and return UTF-8 encoded string when false' do
|
86
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: false)
|
87
|
+
expected = "þÿ\u0000v\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d"
|
88
|
+
assert_equal expected, encoded_string
|
89
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
90
|
+
end
|
81
91
|
|
82
|
-
|
83
|
-
|
84
|
-
|
92
|
+
context "containing embedded CP1252" do
|
93
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with "private" CP1252 when normalize_utf16: true, normalize_cp1252: false' do
|
94
|
+
original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
|
85
95
|
|
86
|
-
|
87
|
-
string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
|
88
|
-
expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
|
96
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
|
89
97
|
|
90
|
-
|
98
|
+
assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
|
99
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
100
|
+
end
|
91
101
|
|
92
|
-
|
93
|
-
|
94
|
-
|
102
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with normalized CP1252 when normalize_utf16: true, normalize_cp1252: true' do
|
103
|
+
original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
|
104
|
+
|
105
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true)
|
106
|
+
|
107
|
+
assert_equal "‘smart quotes’\nsecond", encoded_string
|
108
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
109
|
+
end
|
110
|
+
|
111
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: false' do
|
112
|
+
original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
|
113
|
+
|
114
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
|
115
|
+
|
116
|
+
assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
|
117
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
118
|
+
end
|
119
|
+
|
120
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: true' do
|
121
|
+
original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
|
122
|
+
|
123
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: true)
|
124
|
+
|
125
|
+
assert_equal "‘smart quotes’\nsecond", encoded_string
|
126
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
context 'normalize_cp1252' do
|
132
|
+
setup do
|
133
|
+
@string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five,\x91smart quotes\x92,\x93suck!\x94\n"
|
134
|
+
end
|
135
|
+
|
136
|
+
# With the CP1252 fallback disabled, input that is not valid UTF-8 must be rejected.
should 'raise ArgumentError when false' do
  # assert_raises only takes exception classes (plus an optional failure message), so the
  # message is checked separately on the returned exception.
  ex = assert_raises(ArgumentError) do
    Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_cp1252: false)
  end

  assert_match(/Could not normalize to utf8 due to invalid characters/, ex.message)
end
|
141
|
+
|
142
|
+
should 'return UTF-8 encoded string after falling back to CP1252 encoding when true' do
|
143
|
+
expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~five,‘smart quotes’,“suck!”\n"
|
95
144
|
|
96
|
-
|
97
|
-
all_8_bit_characters = (1..255).map { |char| char.chr }.join
|
145
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string)
|
98
146
|
|
99
|
-
|
100
|
-
|
147
|
+
assert_equal expected_string, encoded_string
|
148
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
149
|
+
end
|
101
150
|
|
102
|
-
|
151
|
+
should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
|
152
|
+
all_8_bit_characters = (1..255).map(&:chr).join
|
153
|
+
|
154
|
+
final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(all_8_bit_characters)
|
155
|
+
expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
|
156
|
+
|
157
|
+
assert_equal expected_string, final_utf_8_string
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
context 'normalize_newlines' do
|
162
|
+
setup do
|
163
|
+
@string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
|
164
|
+
end
|
165
|
+
|
166
|
+
should 'return UTF-8 encoded string without normalized return chars when false' do
|
167
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: false)
|
168
|
+
|
169
|
+
assert_equal @string, encoded_string
|
170
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
171
|
+
end
|
172
|
+
|
173
|
+
should 'return UTF-8 encoded string with normalized return chars when true' do
|
174
|
+
expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
|
175
|
+
|
176
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: true)
|
177
|
+
|
178
|
+
assert_equal expected_string, encoded_string
|
179
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
context 'remove_utf8_bom' do
|
184
|
+
setup do
|
185
|
+
@original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
|
186
|
+
end
|
187
|
+
|
188
|
+
should 'return UTF-8 encoded string with BOM intact when false' do
|
189
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: false)
|
190
|
+
|
191
|
+
assert_equal "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
|
192
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
193
|
+
end
|
194
|
+
|
195
|
+
should 'return UTF-8 encoded string without BOM when true' do
|
196
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: true)
|
197
|
+
|
198
|
+
assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
|
199
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
context 'replace_unicode_beyond_ffff' do
|
204
|
+
setup do
|
205
|
+
@string = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
|
206
|
+
end
|
207
|
+
|
208
|
+
# With replacement disabled, code points above U+FFFF must pass through untouched.
should "leave UTF-8 code points that take > 3 bytes (above U+FFFF) intact when false" do
  encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: false)

  assert_equal @string, encoded_string
  assert_equal Encoding::UTF_8, encoded_string.encoding
end
|
214
|
+
|
215
|
+
should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when true" do
|
216
|
+
expected_string = 'This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF'
|
217
|
+
|
218
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: true)
|
219
|
+
|
220
|
+
assert_equal expected_string, encoded_string
|
221
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
222
|
+
end
|
223
|
+
end
|
103
224
|
end
|
104
225
|
|
105
|
-
|
106
|
-
|
107
|
-
|
226
|
+
context 'constructor' do
|
227
|
+
should 'call normalize_string with the default conversions' do
|
228
|
+
mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
|
108
229
|
|
109
|
-
|
230
|
+
Invoca::Utils::GuaranteedUTF8String.new('').to_string
|
231
|
+
end
|
110
232
|
|
111
|
-
|
112
|
-
|
233
|
+
should 'do the same when using to_s alias' do
|
234
|
+
mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
|
235
|
+
|
236
|
+
Invoca::Utils::GuaranteedUTF8String.new('').to_s
|
237
|
+
end
|
113
238
|
end
|
114
239
|
end
|
115
240
|
end
|
116
|
-
|