invoca-utils 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,68 +1,129 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # This class expects to be initialized with a string and guarantees that the output of the to_string method is in UTF-8 format and fits in 3 bytes/char or less.
3
+ # This class provides a normalize_string method that guarantees that its result is in valid UTF-8
4
+ # format for Ruby and all versions of MySQL (using mb3 storage).
5
+ #
6
+ # [Deprecated] Equivalently, you can also create an instance of this class and call to_string or to_s on it.
4
7
  module Invoca
5
8
  module Utils
6
9
  class GuaranteedUTF8String
7
- def initialize(string)
8
- if string.is_a?(String) ||
9
- (string.respond_to?(:to_s) &&
10
- string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
11
- @string = string.to_s
12
- else
13
- raise ArgumentError, "#{self.class} must be initialized with a string or an object with a non-Kernel .to_s method but instead was #{string.class} #{string.inspect}"
14
- end
15
- end
10
+ attr_reader :to_string
16
11
 
17
- def to_string
18
- @to_string ||= normalize_string(@string)
12
+ def initialize(string)
13
+ @to_string = self.class.normalize_string(string)
19
14
  end
20
15
 
21
- alias_method :to_s, :to_string
16
+ alias to_s to_string
22
17
 
23
18
  private
24
19
 
25
20
  # '~' was chosen because it is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON.
26
- REPLACE_CHARACTER = '~' unless defined?(REPLACE_CHARACTER)
21
+ REPLACE_CHARACTER = '~'
27
22
 
28
- def normalize_string(str)
29
- str = @string.dup
30
- str.force_encoding('UTF-8')
31
- if !str.valid_encoding?
32
- cp1252_to_utf_8(str)
23
+ class << self
24
+ def normalize_string(orig_string,
25
+ normalize_utf16: true,
26
+ normalize_cp1252: true,
27
+ normalize_newlines: true,
28
+ remove_utf8_bom: true,
29
+ replace_unicode_beyond_ffff: true)
30
+ string = if orig_string.is_a?(String) ||
31
+ (orig_string.respond_to?(:to_s) &&
32
+ orig_string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
33
+ orig_string.to_s.dup
34
+ else
35
+ raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
36
+ end
37
+ string.force_encoding('UTF-8')
38
+ normalize_string_from_utf8(string,
39
+ normalize_utf16: normalize_utf16,
40
+ normalize_cp1252: normalize_cp1252,
41
+ normalize_newlines: normalize_newlines,
42
+ remove_utf8_bom: remove_utf8_bom,
43
+ replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
33
44
  end
34
- normalize_newlines(str)
35
- remove_bom(str)
36
- replace_unicode_beyond_ffff(str)
37
- str
38
- end
39
45
 
40
- def normalize_newlines(str)
41
- str.gsub!(/ \r\n | \r | \n /x, "\n")
42
- end
46
+ private
43
47
 
44
- def cp1252_to_utf_8(str)
45
- str.force_encoding('CP1252')
46
- str.encode!(
47
- 'UTF-8',
48
- replace: REPLACE_CHARACTER,
49
- undef: :replace,
50
- invalid: :replace
51
- )
52
- end
48
+ def normalize_string_from_utf8(string,
49
+ normalize_utf16:,
50
+ normalize_cp1252:,
51
+ normalize_newlines:,
52
+ remove_utf8_bom:,
53
+ replace_unicode_beyond_ffff:)
54
+ found_utf_16 = normalize_utf_16(string, normalize_cp1252: normalize_cp1252) if normalize_utf16
55
+ if found_utf_16
56
+ string.encode!('UTF-8')
57
+ else
58
+ unless string.valid_encoding?
59
+ if normalize_cp1252
60
+ cp1252_to_utf_8(string)
61
+ else
62
+ raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
63
+ end
64
+ end
65
+ end
66
+ normalize_newlines(string) if normalize_newlines
67
+ remove_utf8_bom(string) if remove_utf8_bom
68
+ replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff
69
+ string
70
+ end
53
71
 
54
- def remove_bom(str)
55
- str.sub!(/\A \xEF\xBB\xBF/x, '')
56
- end
72
+ UTF_16_LE_BOM = "\xFF\xFE"
73
+ UTF_16_BE_BOM = "\xFE\xFF"
74
+ UTF_8_BOM = "\xEF\xBB\xBF"
75
+
76
+ PRIVATE_CP1252_CHAR_PATTERN = "[\u0080-\u009f]"
77
+ PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16LE'))
78
+ PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16BE'))
79
+
80
+ # Returns truthy iff the string starts with a UTF-16 BOM (LE or BE). In that case the BOM has been removed and the string
81
+ # has been normalized in place, but it is still UTF-16 encoded. Otherwise returns falsey and leaves the string as is.
82
+ def normalize_utf_16(string, normalize_cp1252:)
83
+ case string[0, 2]
84
+ when UTF_16_LE_BOM
85
+ string.slice!(0, 2) # remove the BOM
86
+ string.force_encoding('UTF-16LE')
87
+ normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE, 'UTF-16LE') if normalize_cp1252
88
+ true
89
+ when UTF_16_BE_BOM
90
+ string.slice!(0, 2) # remove the BOM
91
+ string.force_encoding('UTF-16BE')
92
+ normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE, 'UTF-16BE') if normalize_cp1252
93
+ true
94
+ end
95
+ end
96
+
97
+ def normalize_multibyte_cp1252(string, pattern, encoding)
98
+ string.gsub!(pattern) { |c| c.ord.chr.force_encoding('CP1252').encode('UTF-8').encode(encoding) }
99
+ end
57
100
 
58
- # Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
59
- # is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
60
- # it would take a data migration and didn't seem that important.
101
+ def normalize_newlines(string)
102
+ string.gsub!(/ \r\n | \r | \n /x, "\n")
103
+ end
104
+
105
+ def cp1252_to_utf_8(string)
106
+ string.force_encoding('CP1252')
107
+ string.encode!(
108
+ 'UTF-8',
109
+ replace: REPLACE_CHARACTER,
110
+ undef: :replace,
111
+ invalid: :replace
112
+ )
113
+ end
61
114
 
62
- def replace_unicode_beyond_ffff(str)
63
- str.gsub!(/[^\u{0}-\u{ffff}]/x, REPLACE_CHARACTER)
115
+ def remove_utf8_bom(string)
116
+ string.sub!(/\A #{UTF_8_BOM}/x, '')
117
+ end
118
+
119
+ # Note: MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. The mb4 option is needed
120
+ # to hold the code points above that (including emoji), but we haven't enabled it on any columns yet, since
121
+ # it would take a data migration and didn't seem that important.
122
+
123
+ def replace_unicode_beyond_ffff(string)
124
+ string.gsub!(/[^\u0000-\uffff]/x, REPLACE_CHARACTER)
125
+ end
64
126
  end
65
127
  end
66
128
  end
67
129
  end
68
-
@@ -1,5 +1,5 @@
1
1
  module Invoca
2
2
  module Utils
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ # The same as https://github.com/Invoca/test_overrides/blob/master/lib/constant_overrides.rb,
4
+ # but less coupled to ExceptionHandling and Invoca microservices.
5
+
6
+ module ConstantOverrides
7
+ def setup_constant_overrides
8
+ @constant_overrides = []
9
+ end
10
+
11
+ def cleanup_constant_overrides
12
+ @constant_overrides.reverse.each do |parent_module, k, v|
13
+ silence_warnings do
14
+ if v == :never_defined
15
+ parent_module.send(:remove_const, k)
16
+ else
17
+ parent_module.const_set(k, v)
18
+ end
19
+ end
20
+ end
21
+ end
22
+
23
+ def set_test_const(const_name, value)
24
+ const_name.is_a?(Symbol) and (const_name = const_name.to_s)
25
+ const_name.is_a?(String) or raise "Pass the constant name, not its value!"
26
+
27
+ final_parent_module = final_const_name = nil
28
+ original_value =
29
+ const_name.split('::').reduce(Object) do |parent_module, nested_const_name|
30
+ parent_module == :never_defined and raise "You need to set each parent constant earlier! #{nested_const_name}"
31
+ final_parent_module = parent_module
32
+ final_const_name = nested_const_name
33
+ begin
34
+ parent_module.const_get(nested_const_name)
35
+ rescue
36
+ :never_defined
37
+ end
38
+ end
39
+
40
+ @constant_overrides << [final_parent_module, final_const_name, original_value]
41
+
42
+ silence_warnings { final_parent_module.const_set(final_const_name, value) }
43
+ end
44
+ end
data/test/test_helper.rb CHANGED
@@ -3,3 +3,6 @@ require "minitest/autorun"
3
3
  require 'rr'
4
4
  require 'shoulda'
5
5
  require 'pry'
6
+ require 'active_support/all'
7
+
8
+ require 'invoca/utils'
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../../lib/invoca/utils/guaranteed_utf8_string'
3
4
  require_relative '../test_helper'
4
5
 
5
6
  class GuaranteedUTF8StringTest < Minitest::Test
@@ -18,99 +19,222 @@ class GuaranteedUTF8StringTest < Minitest::Test
18
19
  end
19
20
  end
20
21
 
21
- should "raise an error if initialized with an object with no to_s method" do
22
- assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/ do
23
- Invoca::Utils::GuaranteedUTF8String.new(HasNoTo_sMethod.new)
24
- end
25
- end
22
+ context Invoca::Utils::GuaranteedUTF8String do
23
+ context '.normalize_string' do
24
+ should 'raise an error if called with an object with no to_s method' do
25
+ ex = assert_raises ArgumentError do
26
+ Invoca::Utils::GuaranteedUTF8String.normalize_string(HasNoTo_sMethod.new)
27
+ end
26
28
 
27
- should "raise an error if initialized with a basic Ruby object" do
28
- assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/ do
29
- Invoca::Utils::GuaranteedUTF8String.new(BasicObjectWithKernelMethods.new)
30
- end
31
- end
29
+ assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/, ex.message)
30
+ end
32
31
 
33
- should "convert to a string with to_s if possible" do
34
- result = Invoca::Utils::GuaranteedUTF8String.new(ConvertibleToString.new("test string"))
35
- assert_equal "test string", result.to_string
36
- end
32
+ should 'raise an error if called with a basic Ruby object' do
33
+ ex = assert_raises ArgumentError do
34
+ Invoca::Utils::GuaranteedUTF8String.normalize_string(BasicObjectWithKernelMethods.new)
35
+ end
37
36
 
38
- context "#to_string" do
39
- should "not mutate the original string" do
40
- ascii_string = "new string".encode("ASCII")
41
- utf8_string_instance = Invoca::Utils::GuaranteedUTF8String.new(ascii_string)
42
- encoded_string = utf8_string_instance.to_string
37
+ assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/, ex.message)
38
+ end
43
39
 
44
- assert_equal ascii_string, encoded_string
45
- assert_equal Encoding::ASCII, ascii_string.encoding
46
- assert_equal Encoding::UTF_8, encoded_string.encoding
47
- end
40
+ should 'not mutate the original string' do
41
+ ascii_string = 'new string'.encode('ASCII')
42
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(ascii_string)
48
43
 
49
- should "return UTF-8 encoded string" do
50
- original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
44
+ assert_equal ascii_string, encoded_string
45
+ assert_equal Encoding::ASCII, ascii_string.encoding
46
+ assert_equal Encoding::UTF_8, encoded_string.encoding
47
+ end
51
48
 
52
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
49
+ should 'return UTF-8 encoded string' do
50
+ original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
53
51
 
54
- assert_equal original_string, encoded_string
55
- assert_equal Encoding::UTF_8, encoded_string.encoding
56
- end
52
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string)
57
53
 
58
- should "return UTF-8 encoded string without BOM" do
59
- original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
54
+ assert_equal original_string, encoded_string
55
+ assert_equal Encoding::UTF_8, encoded_string.encoding
56
+ end
60
57
 
61
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
58
+ context "normalize_utf16" do
59
+ UTF16_LE_BOM = "\xFF\xFE"
60
+ UTF16_BE_BOM = "\xFE\xFF"
61
+ UTF16_LE_TEST_STRING = (UTF16_LE_BOM + "v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY').freeze
62
+ UTF16_BE_TEST_STRING = (UTF16_BE_BOM + "\x00v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY').freeze
62
63
 
63
- assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
64
- assert_equal Encoding::UTF_8, encoded_string.encoding
65
- end
64
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string when true' do
65
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: true)
66
66
 
67
- should "return UTF-8 encoded string using to_s alias" do
68
- original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
67
+ assert_equal "valid,utf-16\nsecond", encoded_string
68
+ assert_equal Encoding::UTF_8, encoded_string.encoding
69
+ end
69
70
 
70
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_s
71
+ should 'not check for UTF-16LE in BINARY and return UTF-8 encoded string when false' do
72
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: false)
73
+ expected = "ÿþv\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d\u0000"
74
+ assert_equal expected, encoded_string
75
+ assert_equal Encoding::UTF_8, encoded_string.encoding
76
+ end
71
77
 
72
- assert_equal original_string, encoded_string
73
- assert_equal Encoding::UTF_8, encoded_string.encoding
74
- end
78
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when true' do
79
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: true)
75
80
 
76
- should "return UTF-8 encoded string after falling back to CP1252 encoding" do
77
- string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five\xF6,six,seven,eight\n"
78
- expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~fiveö,six,seven,eight\n"
81
+ assert_equal "valid,utf-16\nsecond", encoded_string
82
+ assert_equal Encoding::UTF_8, encoded_string.encoding
83
+ end
79
84
 
80
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
85
+ should 'not check for UTF-16BE in BINARY and return UTF-8 encoded string when false' do
86
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: false)
87
+ expected = "þÿ\u0000v\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d"
88
+ assert_equal expected, encoded_string
89
+ assert_equal Encoding::UTF_8, encoded_string.encoding
90
+ end
81
91
 
82
- assert_equal expected_string, encoded_string
83
- assert_equal Encoding::UTF_8, encoded_string.encoding
84
- end
92
+ context "containing embedded CP1252" do
93
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with "private" CP1252 when normalize_utf16: true, normalize_cp1252: false' do
94
+ original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
85
95
 
86
- should "return UTF-8 encoded string with normalized return chars" do
87
- string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
88
- expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
96
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
89
97
 
90
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
98
+ assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
99
+ assert_equal Encoding::UTF_8, encoded_string.encoding
100
+ end
91
101
 
92
- assert_equal expected_string, encoded_string
93
- assert_equal Encoding::UTF_8, encoded_string.encoding
94
- end
102
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with normalized CP1252 when normalize_utf16: true, normalize_cp1252: true' do
103
+ original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
104
+
105
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true)
106
+
107
+ assert_equal "‘smart quotes’\nsecond", encoded_string
108
+ assert_equal Encoding::UTF_8, encoded_string.encoding
109
+ end
110
+
111
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: false' do
112
+ original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
113
+
114
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
115
+
116
+ assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
117
+ assert_equal Encoding::UTF_8, encoded_string.encoding
118
+ end
119
+
120
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: true' do
121
+ original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
122
+
123
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: true)
124
+
125
+ assert_equal "‘smart quotes’\nsecond", encoded_string
126
+ assert_equal Encoding::UTF_8, encoded_string.encoding
127
+ end
128
+ end
129
+ end
130
+
131
+ context 'normalize_cp1252' do
132
+ setup do
133
+ @string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five,\x91smart quotes\x92,\x93suck!\x94\n"
134
+ end
135
+
136
+ should 'raise ArgumentError when false' do
137
+ assert_raises(ArgumentError, /xxyy/) do
138
+ Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_cp1252: false)
139
+ end
140
+ end
141
+
142
+ should 'return UTF-8 encoded string after falling back to CP1252 encoding when true' do
143
+ expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~five,‘smart quotes’,“suck!”\n"
95
144
 
96
- should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
97
- all_8_bit_characters = (1..255).map { |char| char.chr }.join
145
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string)
98
146
 
99
- final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.new(all_8_bit_characters.dup).to_s
100
- expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
147
+ assert_equal expected_string, encoded_string
148
+ assert_equal Encoding::UTF_8, encoded_string.encoding
149
+ end
101
150
 
102
- assert_equal expected_string, final_utf_8_string
151
+ should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
152
+ all_8_bit_characters = (1..255).map(&:chr).join
153
+
154
+ final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(all_8_bit_characters)
155
+ expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
156
+
157
+ assert_equal expected_string, final_utf_8_string
158
+ end
159
+ end
160
+
161
+ context 'normalize_newlines' do
162
+ setup do
163
+ @string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
164
+ end
165
+
166
+ should 'return UTF-8 encoded string without normalized return chars when false' do
167
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: false)
168
+
169
+ assert_equal @string, encoded_string
170
+ assert_equal Encoding::UTF_8, encoded_string.encoding
171
+ end
172
+
173
+ should 'return UTF-8 encoded string with normalized return chars when true' do
174
+ expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
175
+
176
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: true)
177
+
178
+ assert_equal expected_string, encoded_string
179
+ assert_equal Encoding::UTF_8, encoded_string.encoding
180
+ end
181
+ end
182
+
183
+ context 'remove_utf8_bom' do
184
+ setup do
185
+ @original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
186
+ end
187
+
188
+ should 'return UTF-8 encoded string with BOM intact when false' do
189
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: false)
190
+
191
+ assert_equal "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
192
+ assert_equal Encoding::UTF_8, encoded_string.encoding
193
+ end
194
+
195
+ should 'return UTF-8 encoded string without BOM when true' do
196
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: true)
197
+
198
+ assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
199
+ assert_equal Encoding::UTF_8, encoded_string.encoding
200
+ end
201
+ end
202
+
203
+ context 'replace_unicode_beyond_ffff' do
204
+ setup do
205
+ @string = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
206
+ end
207
+
208
+ should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when false" do
209
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: false)
210
+
211
+ assert_equal @string, encoded_string
212
+ assert_equal Encoding::UTF_8, encoded_string.encoding
213
+ end
214
+
215
+ should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when true" do
216
+ expected_string = 'This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF'
217
+
218
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: true)
219
+
220
+ assert_equal expected_string, encoded_string
221
+ assert_equal Encoding::UTF_8, encoded_string.encoding
222
+ end
223
+ end
103
224
  end
104
225
 
105
- should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~" do
106
- string = "This string has some valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
107
- expected_string = "This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF"
226
+ context 'constructor' do
227
+ should 'call normalize_string with the default conversions' do
228
+ mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
108
229
 
109
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
230
+ Invoca::Utils::GuaranteedUTF8String.new('').to_string
231
+ end
110
232
 
111
- assert_equal expected_string, encoded_string
112
- assert_equal Encoding::UTF_8, encoded_string.encoding
233
+ should 'do the same when using to_s alias' do
234
+ mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
235
+
236
+ Invoca::Utils::GuaranteedUTF8String.new('').to_s
237
+ end
113
238
  end
114
239
  end
115
240
  end
116
-