invoca-utils 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.rubocop.yml +5 -0
- data/Gemfile +11 -1
- data/Gemfile.lock +2 -2
- data/invoca-utils.gemspec +0 -11
- data/lib/invoca/utils.rb +8 -0
- data/lib/invoca/utils/diff.rb +290 -282
- data/lib/invoca/utils/guaranteed_utf8_string.rb +106 -45
- data/lib/invoca/utils/version.rb +1 -1
- data/test/helpers/constant_overrides.rb +44 -0
- data/test/test_helper.rb +3 -0
- data/test/unit/guaranteed_utf8_string_test.rb +191 -67
- data/test/unit/time_calculations_test.rb +0 -1
- data/test/unit/utils_test.rb +45 -2
- metadata +6 -129
@@ -1,68 +1,129 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# This class
|
3
|
+
# This class provides a normalize_string method that guarantees that its result is in valid UTF-8
|
4
|
+
# format for Ruby and all versions of MySQL (using mb3 storage).
|
5
|
+
#
|
6
|
+
# [Deprecated] Equivalently, you can also create an instance of this class and call to_string or to_s on it.
|
4
7
|
module Invoca
|
5
8
|
module Utils
|
6
9
|
class GuaranteedUTF8String
|
7
|
-
|
8
|
-
if string.is_a?(String) ||
|
9
|
-
(string.respond_to?(:to_s) &&
|
10
|
-
string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
|
11
|
-
@string = string.to_s
|
12
|
-
else
|
13
|
-
raise ArgumentError, "#{self.class} must be initialized with a string or an object with a non-Kernel .to_s method but instead was #{string.class} #{string.inspect}"
|
14
|
-
end
|
15
|
-
end
|
10
|
+
attr_reader :to_string
|
16
11
|
|
17
|
-
def
|
18
|
-
@to_string
|
12
|
+
def initialize(string)
|
13
|
+
@to_string = self.class.normalize_string(string)
|
19
14
|
end
|
20
15
|
|
21
|
-
|
16
|
+
alias to_s to_string
|
22
17
|
|
23
18
|
private
|
24
19
|
|
25
20
|
# chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON
|
26
|
-
REPLACE_CHARACTER = '~'
|
21
|
+
REPLACE_CHARACTER = '~'
|
27
22
|
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
23
|
+
class << self
|
24
|
+
def normalize_string(orig_string,
|
25
|
+
normalize_utf16: true,
|
26
|
+
normalize_cp1252: true,
|
27
|
+
normalize_newlines: true,
|
28
|
+
remove_utf8_bom: true,
|
29
|
+
replace_unicode_beyond_ffff: true)
|
30
|
+
string = if orig_string.is_a?(String) ||
|
31
|
+
(orig_string.respond_to?(:to_s) &&
|
32
|
+
orig_string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
|
33
|
+
orig_string.to_s.dup
|
34
|
+
else
|
35
|
+
raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
|
36
|
+
end
|
37
|
+
string.force_encoding('UTF-8')
|
38
|
+
normalize_string_from_utf8(string,
|
39
|
+
normalize_utf16: normalize_utf16,
|
40
|
+
normalize_cp1252: normalize_cp1252,
|
41
|
+
normalize_newlines: normalize_newlines,
|
42
|
+
remove_utf8_bom: remove_utf8_bom,
|
43
|
+
replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
|
33
44
|
end
|
34
|
-
normalize_newlines(str)
|
35
|
-
remove_bom(str)
|
36
|
-
replace_unicode_beyond_ffff(str)
|
37
|
-
str
|
38
|
-
end
|
39
45
|
|
40
|
-
|
41
|
-
str.gsub!(/ \r\n | \r | \n /x, "\n")
|
42
|
-
end
|
46
|
+
private
|
43
47
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
48
|
+
def normalize_string_from_utf8(string,
|
49
|
+
normalize_utf16:,
|
50
|
+
normalize_cp1252:,
|
51
|
+
normalize_newlines:,
|
52
|
+
remove_utf8_bom:,
|
53
|
+
replace_unicode_beyond_ffff:)
|
54
|
+
found_utf_16 = normalize_utf_16(string, normalize_cp1252: normalize_cp1252) if normalize_utf16
|
55
|
+
if found_utf_16
|
56
|
+
string.encode!('UTF-8')
|
57
|
+
else
|
58
|
+
unless string.valid_encoding?
|
59
|
+
if normalize_cp1252
|
60
|
+
cp1252_to_utf_8(string)
|
61
|
+
else
|
62
|
+
raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
normalize_newlines(string) if normalize_newlines
|
67
|
+
remove_utf8_bom(string) if remove_utf8_bom
|
68
|
+
replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff
|
69
|
+
string
|
70
|
+
end
|
53
71
|
|
54
|
-
|
55
|
-
|
56
|
-
|
72
|
+
UTF_16_LE_BOM = "\xFF\xFE"
|
73
|
+
UTF_16_BE_BOM = "\xFE\xFF"
|
74
|
+
UTF_8_BOM = "\xEF\xBB\xBF"
|
75
|
+
|
76
|
+
PRIVATE_CP1252_CHAR_PATTERN = "[\u0080-\u009f]"
|
77
|
+
PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16LE'))
|
78
|
+
PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16BE'))
|
79
|
+
|
80
|
+
# returns truthy iff UTF_16 was found, in which case it has been normalized but the string is still UTF-16
|
81
|
+
# otherwise returns falsey and leaves the string as is
|
82
|
+
def normalize_utf_16(string, normalize_cp1252:)
|
83
|
+
case string[0, 2]
|
84
|
+
when UTF_16_LE_BOM
|
85
|
+
string.slice!(0, 2) # remove the BOM
|
86
|
+
string.force_encoding('UTF-16LE')
|
87
|
+
normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE, 'UTF-16LE') if normalize_cp1252
|
88
|
+
true
|
89
|
+
when UTF_16_BE_BOM
|
90
|
+
string.slice!(0, 2) # remove the BOM
|
91
|
+
string.force_encoding('UTF-16BE')
|
92
|
+
normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE, 'UTF-16BE') if normalize_cp1252
|
93
|
+
true
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def normalize_multibyte_cp1252(string, pattern, encoding)
|
98
|
+
string.gsub!(pattern) { |c| c.ord.chr.force_encoding('CP1252').encode('UTF-8').encode(encoding) }
|
99
|
+
end
|
57
100
|
|
58
|
-
|
59
|
-
|
60
|
-
|
101
|
+
def normalize_newlines(string)
|
102
|
+
string.gsub!(/ \r\n | \r | \n /x, "\n")
|
103
|
+
end
|
104
|
+
|
105
|
+
def cp1252_to_utf_8(string)
|
106
|
+
string.force_encoding('CP1252')
|
107
|
+
string.encode!(
|
108
|
+
'UTF-8',
|
109
|
+
replace: REPLACE_CHARACTER,
|
110
|
+
undef: :replace,
|
111
|
+
invalid: :replace
|
112
|
+
)
|
113
|
+
end
|
61
114
|
|
62
|
-
|
63
|
-
|
115
|
+
def remove_utf8_bom(string)
|
116
|
+
string.sub!(/\A #{UTF_8_BOM}/x, '')
|
117
|
+
end
|
118
|
+
|
119
|
+
# Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
|
120
|
+
# is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
|
121
|
+
# it would take a data migration and didn't seem that important.
|
122
|
+
|
123
|
+
def replace_unicode_beyond_ffff(string)
|
124
|
+
string.gsub!(/[^\u0000-\uffff]/x, REPLACE_CHARACTER)
|
125
|
+
end
|
64
126
|
end
|
65
127
|
end
|
66
128
|
end
|
67
129
|
end
|
68
|
-
|
data/lib/invoca/utils/version.rb
CHANGED
@@ -0,0 +1,44 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The same as https://github.com/Invoca/test_overrides/blob/master/lib/constant_overrides.rb,
|
4
|
+
# but less coupled to ExceptionHandling and Invoca microservices.
|
5
|
+
|
6
|
+
module ConstantOverrides
|
7
|
+
def setup_constant_overrides
|
8
|
+
@constant_overrides = []
|
9
|
+
end
|
10
|
+
|
11
|
+
def cleanup_constant_overrides
|
12
|
+
@constant_overrides.reverse.each do |parent_module, k, v|
|
13
|
+
silence_warnings do
|
14
|
+
if v == :never_defined
|
15
|
+
parent_module.send(:remove_const, k)
|
16
|
+
else
|
17
|
+
parent_module.const_set(k, v)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def set_test_const(const_name, value)
|
24
|
+
const_name.is_a?(Symbol) and (const_name = const_name.to_s)
|
25
|
+
const_name.is_a?(String) or raise "Pass the constant name, not its value!"
|
26
|
+
|
27
|
+
final_parent_module = final_const_name = nil
|
28
|
+
original_value =
|
29
|
+
const_name.split('::').reduce(Object) do |parent_module, nested_const_name|
|
30
|
+
parent_module == :never_defined and raise "You need to set each parent constant earlier! #{nested_const_name}"
|
31
|
+
final_parent_module = parent_module
|
32
|
+
final_const_name = nested_const_name
|
33
|
+
begin
|
34
|
+
parent_module.const_get(nested_const_name)
|
35
|
+
rescue
|
36
|
+
:never_defined
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
@constant_overrides << [final_parent_module, final_const_name, original_value]
|
41
|
+
|
42
|
+
silence_warnings { final_parent_module.const_set(final_const_name, value) }
|
43
|
+
end
|
44
|
+
end
|
data/test/test_helper.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require_relative '../../lib/invoca/utils/guaranteed_utf8_string'
|
3
4
|
require_relative '../test_helper'
|
4
5
|
|
5
6
|
class GuaranteedUTF8StringTest < Minitest::Test
|
@@ -18,99 +19,222 @@ class GuaranteedUTF8StringTest < Minitest::Test
|
|
18
19
|
end
|
19
20
|
end
|
20
21
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
context Invoca::Utils::GuaranteedUTF8String do
|
23
|
+
context '.normalize_string' do
|
24
|
+
should 'raise an error if called with an object with no to_s method' do
|
25
|
+
ex = assert_raises ArgumentError do
|
26
|
+
Invoca::Utils::GuaranteedUTF8String.normalize_string(HasNoTo_sMethod.new)
|
27
|
+
end
|
26
28
|
|
27
|
-
|
28
|
-
|
29
|
-
Invoca::Utils::GuaranteedUTF8String.new(BasicObjectWithKernelMethods.new)
|
30
|
-
end
|
31
|
-
end
|
29
|
+
assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/, ex.message)
|
30
|
+
end
|
32
31
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
should 'raise an error if called with a basic Ruby object' do
|
33
|
+
ex = assert_raises ArgumentError do
|
34
|
+
Invoca::Utils::GuaranteedUTF8String.normalize_string(BasicObjectWithKernelMethods.new)
|
35
|
+
end
|
37
36
|
|
38
|
-
|
39
|
-
|
40
|
-
ascii_string = "new string".encode("ASCII")
|
41
|
-
utf8_string_instance = Invoca::Utils::GuaranteedUTF8String.new(ascii_string)
|
42
|
-
encoded_string = utf8_string_instance.to_string
|
37
|
+
assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/, ex.message)
|
38
|
+
end
|
43
39
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
end
|
40
|
+
should 'not mutate the original string' do
|
41
|
+
ascii_string = 'new string'.encode('ASCII')
|
42
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(ascii_string)
|
48
43
|
|
49
|
-
|
50
|
-
|
44
|
+
assert_equal ascii_string, encoded_string
|
45
|
+
assert_equal Encoding::ASCII, ascii_string.encoding
|
46
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
47
|
+
end
|
51
48
|
|
52
|
-
|
49
|
+
should 'return UTF-8 encoded string' do
|
50
|
+
original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
|
53
51
|
|
54
|
-
|
55
|
-
assert_equal Encoding::UTF_8, encoded_string.encoding
|
56
|
-
end
|
52
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string)
|
57
53
|
|
58
|
-
|
59
|
-
|
54
|
+
assert_equal original_string, encoded_string
|
55
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
56
|
+
end
|
60
57
|
|
61
|
-
|
58
|
+
context "normalize_utf16" do
|
59
|
+
UTF16_LE_BOM = "\xFF\xFE"
|
60
|
+
UTF16_BE_BOM = "\xFE\xFF"
|
61
|
+
UTF16_LE_TEST_STRING = (UTF16_LE_BOM + "v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY').freeze
|
62
|
+
UTF16_BE_TEST_STRING = (UTF16_BE_BOM + "\x00v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY').freeze
|
62
63
|
|
63
|
-
|
64
|
-
|
65
|
-
end
|
64
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string when true' do
|
65
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: true)
|
66
66
|
|
67
|
-
|
68
|
-
|
67
|
+
assert_equal "valid,utf-16\nsecond", encoded_string
|
68
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
69
|
+
end
|
69
70
|
|
70
|
-
|
71
|
+
should 'not check for UTF-16LE in BINARY and return UTF-8 encoded string when false' do
|
72
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: false)
|
73
|
+
expected = "ÿþv\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d\u0000"
|
74
|
+
assert_equal expected, encoded_string
|
75
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
76
|
+
end
|
71
77
|
|
72
|
-
|
73
|
-
|
74
|
-
end
|
78
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when true' do
|
79
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: true)
|
75
80
|
|
76
|
-
|
77
|
-
|
78
|
-
|
81
|
+
assert_equal "valid,utf-16\nsecond", encoded_string
|
82
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
83
|
+
end
|
79
84
|
|
80
|
-
|
85
|
+
should 'not check for UTF-16BE in BINARY and return UTF-8 encoded string when false' do
|
86
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: false)
|
87
|
+
expected = "þÿ\u0000v\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d"
|
88
|
+
assert_equal expected, encoded_string
|
89
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
90
|
+
end
|
81
91
|
|
82
|
-
|
83
|
-
|
84
|
-
|
92
|
+
context "containing embedded CP1252" do
|
93
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with "private" CP1252 when normalize_utf16: true, normalize_cp1252: false' do
|
94
|
+
original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
|
85
95
|
|
86
|
-
|
87
|
-
string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
|
88
|
-
expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
|
96
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
|
89
97
|
|
90
|
-
|
98
|
+
assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
|
99
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
100
|
+
end
|
91
101
|
|
92
|
-
|
93
|
-
|
94
|
-
|
102
|
+
should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with normalized CP1252 when normalize_utf16: true, normalize_cp1252: true' do
|
103
|
+
original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
|
104
|
+
|
105
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true)
|
106
|
+
|
107
|
+
assert_equal "‘smart quotes’\nsecond", encoded_string
|
108
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
109
|
+
end
|
110
|
+
|
111
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: false' do
|
112
|
+
original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
|
113
|
+
|
114
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
|
115
|
+
|
116
|
+
assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
|
117
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
118
|
+
end
|
119
|
+
|
120
|
+
should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: true' do
|
121
|
+
original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
|
122
|
+
|
123
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: true)
|
124
|
+
|
125
|
+
assert_equal "‘smart quotes’\nsecond", encoded_string
|
126
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
context 'normalize_cp1252' do
|
132
|
+
setup do
|
133
|
+
@string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five,\x91smart quotes\x92,\x93suck!\x94\n"
|
134
|
+
end
|
135
|
+
|
136
|
+
should 'raise ArgumentError when false' do
|
137
|
+
assert_raises(ArgumentError, /xxyy/) do
|
138
|
+
Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_cp1252: false)
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
should 'return UTF-8 encoded string after falling back to CP1252 encoding when true' do
|
143
|
+
expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~five,‘smart quotes’,“suck!”\n"
|
95
144
|
|
96
|
-
|
97
|
-
all_8_bit_characters = (1..255).map { |char| char.chr }.join
|
145
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string)
|
98
146
|
|
99
|
-
|
100
|
-
|
147
|
+
assert_equal expected_string, encoded_string
|
148
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
149
|
+
end
|
101
150
|
|
102
|
-
|
151
|
+
should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
|
152
|
+
all_8_bit_characters = (1..255).map(&:chr).join
|
153
|
+
|
154
|
+
final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(all_8_bit_characters)
|
155
|
+
expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
|
156
|
+
|
157
|
+
assert_equal expected_string, final_utf_8_string
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
context 'normalize_newlines' do
|
162
|
+
setup do
|
163
|
+
@string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
|
164
|
+
end
|
165
|
+
|
166
|
+
should 'return UTF-8 encoded string without normalized return chars when false' do
|
167
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: false)
|
168
|
+
|
169
|
+
assert_equal @string, encoded_string
|
170
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
171
|
+
end
|
172
|
+
|
173
|
+
should 'return UTF-8 encoded string with normalized return chars when true' do
|
174
|
+
expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
|
175
|
+
|
176
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: true)
|
177
|
+
|
178
|
+
assert_equal expected_string, encoded_string
|
179
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
context 'remove_utf8_bom' do
|
184
|
+
setup do
|
185
|
+
@original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
|
186
|
+
end
|
187
|
+
|
188
|
+
should 'return UTF-8 encoded string with BOM intact when false' do
|
189
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: false)
|
190
|
+
|
191
|
+
assert_equal "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
|
192
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
193
|
+
end
|
194
|
+
|
195
|
+
should 'return UTF-8 encoded string without BOM when true' do
|
196
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: true)
|
197
|
+
|
198
|
+
assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
|
199
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
200
|
+
end
|
201
|
+
end
|
202
|
+
|
203
|
+
context 'replace_unicode_beyond_ffff' do
|
204
|
+
setup do
|
205
|
+
@string = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
|
206
|
+
end
|
207
|
+
|
208
|
+
should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when false" do
|
209
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: false)
|
210
|
+
|
211
|
+
assert_equal @string, encoded_string
|
212
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
213
|
+
end
|
214
|
+
|
215
|
+
should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when true" do
|
216
|
+
expected_string = 'This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF'
|
217
|
+
|
218
|
+
encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: true)
|
219
|
+
|
220
|
+
assert_equal expected_string, encoded_string
|
221
|
+
assert_equal Encoding::UTF_8, encoded_string.encoding
|
222
|
+
end
|
223
|
+
end
|
103
224
|
end
|
104
225
|
|
105
|
-
|
106
|
-
|
107
|
-
|
226
|
+
context 'constructor' do
|
227
|
+
should 'call normalize_string with the default conversions' do
|
228
|
+
mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
|
108
229
|
|
109
|
-
|
230
|
+
Invoca::Utils::GuaranteedUTF8String.new('').to_string
|
231
|
+
end
|
110
232
|
|
111
|
-
|
112
|
-
|
233
|
+
should 'do the same when using to_s alias' do
|
234
|
+
mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
|
235
|
+
|
236
|
+
Invoca::Utils::GuaranteedUTF8String.new('').to_s
|
237
|
+
end
|
113
238
|
end
|
114
239
|
end
|
115
240
|
end
|
116
|
-
|