invoca-utils 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,68 +1,129 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- # This class expects to be initialized with a string and guarantees that the output of the to_string method is in UTF-8 format and fits in 3 bytes/char or less.
3
+ # This class provides a normalize_string method that guarantees that its result is in valid UTF-8
4
+ # format for Ruby and all versions of MySQL (using mb3 storage).
5
+ #
6
+ # [Deprecated] Equivalently, you can also create an instance of this class and call to_string or to_s on it.
4
7
  module Invoca
5
8
  module Utils
6
9
  class GuaranteedUTF8String
7
- def initialize(string)
8
- if string.is_a?(String) ||
9
- (string.respond_to?(:to_s) &&
10
- string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
11
- @string = string.to_s
12
- else
13
- raise ArgumentError, "#{self.class} must be initialized with a string or an object with a non-Kernel .to_s method but instead was #{string.class} #{string.inspect}"
14
- end
15
- end
10
+ attr_reader :to_string
16
11
 
17
- def to_string
18
- @to_string ||= normalize_string(@string)
12
# Normalizes +string+ eagerly with the default options of .normalize_string and
# stores the result, which is exposed through the #to_string reader (aliased as #to_s).
#
# Raises whatever .normalize_string raises for the given argument.
def initialize(string)
  normalizer = self.class
  @to_string = normalizer.normalize_string(string)
end
20
15
 
21
- alias_method :to_s, :to_string
16
+ alias to_s to_string
22
17
 
23
18
  private
24
19
 
25
20
  # chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON
26
- REPLACE_CHARACTER = '~' unless defined?(REPLACE_CHARACTER)
21
+ REPLACE_CHARACTER = '~'
27
22
 
28
- def normalize_string(str)
29
- str = @string.dup
30
- str.force_encoding('UTF-8')
31
- if !str.valid_encoding?
32
- cp1252_to_utf_8(str)
23
+ class << self
24
# Returns a normalized UTF-8 copy of +orig_string+; the argument itself is never mutated.
#
# orig_string - a String, or any object whose #to_s is not the one inherited from Kernel.
# Keyword flags (all default to true) switch the individual normalization steps on or off:
#   normalize_utf16             - detect a UTF-16 LE/BE byte order mark and transcode to UTF-8
#   normalize_cp1252            - fall back to CP1252 when the bytes are not valid UTF-8
#                                 (when false, such input raises ArgumentError instead)
#   normalize_newlines          - turn "\r\n" and "\r" into "\n"
#   remove_utf8_bom             - strip a leading UTF-8 byte order mark
#   replace_unicode_beyond_ffff - replace code points above U+FFFF with REPLACE_CHARACTER
#
# Raises ArgumentError when +orig_string+ is neither a String nor has a non-Kernel #to_s.
def normalize_string(orig_string,
                     normalize_utf16: true,
                     normalize_cp1252: true,
                     normalize_newlines: true,
                     remove_utf8_bom: true,
                     replace_unicode_beyond_ffff: true)
  # the lame .to_s from Kernel just calls .inspect :(
  convertible = orig_string.is_a?(String) ||
                (orig_string.respond_to?(:to_s) && orig_string.method(:to_s).owner != Kernel)
  unless convertible
    raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
  end

  # Work on a private copy so the caller's object keeps its bytes and encoding.
  utf8_copy = orig_string.to_s.dup.force_encoding('UTF-8')

  normalize_string_from_utf8(utf8_copy,
                             normalize_utf16: normalize_utf16,
                             normalize_cp1252: normalize_cp1252,
                             normalize_newlines: normalize_newlines,
                             remove_utf8_bom: remove_utf8_bom,
                             replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
end
34
- normalize_newlines(str)
35
- remove_bom(str)
36
- replace_unicode_beyond_ffff(str)
37
- str
38
- end
39
45
 
40
- def normalize_newlines(str)
41
- str.gsub!(/ \r\n | \r | \n /x, "\n")
42
- end
46
+ private
43
47
 
44
- def cp1252_to_utf_8(str)
45
- str.force_encoding('CP1252')
46
- str.encode!(
47
- 'UTF-8',
48
- replace: REPLACE_CHARACTER,
49
- undef: :replace,
50
- invalid: :replace
51
- )
52
- end
48
# Runs the enabled normalization steps, in place, on +string+ (already tagged as UTF-8)
# and returns it.
#
# Order of operations:
#   1. UTF-16 detection/transcoding (only when normalize_utf16 is set)
#   2. otherwise, if the bytes are not valid UTF-8: CP1252 fallback, or ArgumentError
#      when normalize_cp1252 is false
#   3. newline normalization, UTF-8 BOM removal, replacement of code points above U+FFFF,
#      each only when its flag is set
def normalize_string_from_utf8(string,
                               normalize_utf16:,
                               normalize_cp1252:,
                               normalize_newlines:,
                               remove_utf8_bom:,
                               replace_unicode_beyond_ffff:)
  utf16_detected = normalize_utf16 && normalize_utf_16(string, normalize_cp1252: normalize_cp1252)

  if utf16_detected
    # normalize_utf_16 leaves the string tagged as UTF-16; transcode it now.
    string.encode!('UTF-8')
  elsif !string.valid_encoding?
    unless normalize_cp1252
      raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
    end

    cp1252_to_utf_8(string)
  end

  normalize_newlines(string)          if normalize_newlines
  remove_utf8_bom(string)             if remove_utf8_bom
  replace_unicode_beyond_ffff(string) if replace_unicode_beyond_ffff
  string
end
53
71
 
54
- def remove_bom(str)
55
- str.sub!(/\A \xEF\xBB\xBF/x, '')
56
- end
72
+ UTF_16_LE_BOM = "\xFF\xFE"
73
+ UTF_16_BE_BOM = "\xFE\xFF"
74
+ UTF_8_BOM = "\xEF\xBB\xBF"
75
+
76
+ PRIVATE_CP1252_CHAR_PATTERN = "[\u0080-\u009f]"
77
+ PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16LE'))
78
+ PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16BE'))
79
+
80
# Detects a UTF-16 byte order mark at the start of +string+.
#
# When one is found, the BOM is removed, the string is re-tagged as UTF-16LE/BE and,
# if normalize_cp1252 is set, "private" CP1252 characters are normalized. The string is
# still UTF-16 afterwards and the method returns true.
# When no BOM is found, the string is left as is and the method returns nil.
def normalize_utf_16(string, normalize_cp1252:)
  encoding, cp1252_pattern =
    case string[0, 2]
    when UTF_16_LE_BOM then ['UTF-16LE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE]
    when UTF_16_BE_BOM then ['UTF-16BE', PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE]
    end
  return unless encoding

  string.slice!(0, 2) # remove the BOM
  string.force_encoding(encoding)
  normalize_multibyte_cp1252(string, cp1252_pattern, encoding) if normalize_cp1252
  true
end
96
+
97
# Rewrites, in place, every character of +string+ matched by +pattern+ by treating its
# code point as a single CP1252 byte and substituting the character CP1252 assigns to it
# (e.g. U+0091 becomes a left single quotation mark).
#
# string   - the string to fix up; it is expected to be in +encoding+
# pattern  - Regexp (compatible with +encoding+) matching the characters to reinterpret
# encoding - name of the encoding the replacement is converted back to, e.g. 'UTF-16LE'
#
# Bytes that CP1252 leaves unassigned (such as 0x81) are replaced with REPLACE_CHARACTER
# instead of raising Encoding::UndefinedConversionError, matching what cp1252_to_utf_8
# does for single-byte input.
#
# Returns the string, or nil when nothing matched (String#gsub! semantics).
def normalize_multibyte_cp1252(string, pattern, encoding)
  string.gsub!(pattern) do |char|
    cp1252_byte = char.ord.chr.force_encoding('CP1252')
    cp1252_byte
      .encode('UTF-8', replace: REPLACE_CHARACTER, undef: :replace, invalid: :replace)
      .encode(encoding)
  end
end
57
100
 
58
- # Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
59
- # is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
60
- # it would take a data migration and didn't seem that important.
101
# Converts every line ending in +string+ (CRLF, lone CR, lone LF) to "\n", in place.
# Returns the string, or nil when it contains no line ending at all (String#gsub! semantics).
def normalize_newlines(string)
  any_line_ending = / \r\n | \r | \n /x
  string.gsub!(any_line_ending, "\n")
end
104
+
105
# Reinterprets the bytes of +string+ as CP1252 and transcodes them to UTF-8, in place.
# Bytes that are invalid or have no UTF-8 counterpart become REPLACE_CHARACTER.
# Returns the (mutated) string.
def cp1252_to_utf_8(string)
  substitution = { replace: REPLACE_CHARACTER, undef: :replace, invalid: :replace }
  string.force_encoding('CP1252').encode!('UTF-8', **substitution)
end
61
114
 
62
- def replace_unicode_beyond_ffff(str)
63
- str.gsub!(/[^\u{0}-\u{ffff}]/x, REPLACE_CHARACTER)
115
# Strips a UTF-8 byte order mark from the very beginning of +string+, in place.
# Returns the string, or nil when there was no leading BOM (String#sub! semantics).
def remove_utf8_bom(string)
  leading_bom = /\A #{UTF_8_BOM}/x
  string.sub!(leading_bom, '')
end
118
+
119
# Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
# is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
# it would take a data migration and didn't seem that important.
#
# Replaces, in place, every character above U+FFFF in +string+ with REPLACE_CHARACTER.
# Returns the string, or nil when no such character is present (String#gsub! semantics).
def replace_unicode_beyond_ffff(string)
  beyond_ffff = /[^\u0000-\uffff]/x
  string.gsub!(beyond_ffff, REPLACE_CHARACTER)
end
64
126
  end
65
127
  end
66
128
  end
67
129
  end
68
-
@@ -1,5 +1,5 @@
1
1
  module Invoca
2
2
  module Utils
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ # The same as https://github.com/Invoca/test_overrides/blob/master/lib/constant_overrides.rb,
4
+ # but less coupled to ExceptionHandling and Invoca microservices.
5
+
6
# Test helper mixin that temporarily replaces constants and restores them afterwards.
#
# Typical flow: call #setup_constant_overrides before a test, #set_test_const any number
# of times during it, and #cleanup_constant_overrides afterwards.
#
# The including object must respond to +silence_warnings+ (not defined here; presumably
# ActiveSupport's Kernel extension, since the test helper requires active_support/all).
module ConstantOverrides
  # Starts a fresh list of recorded overrides.
  def setup_constant_overrides
    @constant_overrides = []
  end

  # Restores every constant changed through #set_test_const, most recent first, so that
  # nested or repeated overrides unwind correctly. Constants that did not exist before the
  # override are removed again. The recorded list is emptied afterwards, which makes a
  # second call a no-op; calling it without prior setup is a no-op as well.
  def cleanup_constant_overrides
    overrides = @constant_overrides || []
    overrides.reverse_each do |parent_module, const_name, original_value|
      silence_warnings do
        if original_value == :never_defined
          parent_module.send(:remove_const, const_name)
        else
          parent_module.const_set(const_name, original_value)
        end
      end
    end
    overrides.clear
  end

  # Sets the constant named +const_name+ to +value+ and records how to undo that.
  #
  # const_name - String or Symbol, optionally namespaced ("Outer::Inner::NAME"). Every
  #              namespace segment must already resolve to a module.
  # value      - the temporary value.
  #
  # Raises RuntimeError when +const_name+ is not a name, is empty, or when one of its
  # parent namespaces is not defined.
  def set_test_const(const_name, value)
    const_name = const_name.to_s if const_name.is_a?(Symbol)
    raise "Pass the constant name, not its value!" unless const_name.is_a?(String)

    segments = const_name.split('::')
    raise "Constant name must not be empty!" if segments.empty?

    final_parent_module = final_const_name = nil
    original_value =
      segments.each_with_index.reduce(Object) do |parent_module, (nested_const_name, index)|
        raise "You need to set each parent constant earlier! #{nested_const_name}" if parent_module == :never_defined

        final_parent_module = parent_module
        final_const_name = nested_const_name

        # Namespaces are resolved the usual way (ancestors included). The constant being
        # overridden is only looked up on its direct parent: a same-named constant that is
        # merely visible through Object or an ancestor is not "the original", and treating
        # it as such would leave a stray copy on the parent after cleanup.
        search_ancestors = index < segments.size - 1
        begin
          parent_module.const_get(nested_const_name, search_ancestors)
        rescue NameError
          # Only "no such constant" means never defined; other errors should surface.
          :never_defined
        end
      end

    (@constant_overrides ||= []) << [final_parent_module, final_const_name, original_value]

    silence_warnings { final_parent_module.const_set(final_const_name, value) }
  end
end
data/test/test_helper.rb CHANGED
@@ -3,3 +3,6 @@ require "minitest/autorun"
3
3
  require 'rr'
4
4
  require 'shoulda'
5
5
  require 'pry'
6
+ require 'active_support/all'
7
+
8
+ require 'invoca/utils'
@@ -1,5 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require_relative '../../lib/invoca/utils/guaranteed_utf8_string'
3
4
  require_relative '../test_helper'
4
5
 
5
6
  class GuaranteedUTF8StringTest < Minitest::Test
@@ -18,99 +19,222 @@ class GuaranteedUTF8StringTest < Minitest::Test
18
19
  end
19
20
  end
20
21
 
21
- should "raise an error if initialized with an object with no to_s method" do
22
- assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/ do
23
- Invoca::Utils::GuaranteedUTF8String.new(HasNoTo_sMethod.new)
24
- end
25
- end
22
+ context Invoca::Utils::GuaranteedUTF8String do
23
+ context '.normalize_string' do
24
+ should 'raise an error if called with an object with no to_s method' do
25
+ ex = assert_raises ArgumentError do
26
+ Invoca::Utils::GuaranteedUTF8String.normalize_string(HasNoTo_sMethod.new)
27
+ end
26
28
 
27
- should "raise an error if initialized with a basic Ruby object" do
28
- assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/ do
29
- Invoca::Utils::GuaranteedUTF8String.new(BasicObjectWithKernelMethods.new)
30
- end
31
- end
29
+ assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/, ex.message)
30
+ end
32
31
 
33
- should "convert to a string with to_s if possible" do
34
- result = Invoca::Utils::GuaranteedUTF8String.new(ConvertibleToString.new("test string"))
35
- assert_equal "test string", result.to_string
36
- end
32
+ should 'raise an error if called with a basic Ruby object' do
33
+ ex = assert_raises ArgumentError do
34
+ Invoca::Utils::GuaranteedUTF8String.normalize_string(BasicObjectWithKernelMethods.new)
35
+ end
37
36
 
38
- context "#to_string" do
39
- should "not mutate the original string" do
40
- ascii_string = "new string".encode("ASCII")
41
- utf8_string_instance = Invoca::Utils::GuaranteedUTF8String.new(ascii_string)
42
- encoded_string = utf8_string_instance.to_string
37
+ assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/, ex.message)
38
+ end
43
39
 
44
- assert_equal ascii_string, encoded_string
45
- assert_equal Encoding::ASCII, ascii_string.encoding
46
- assert_equal Encoding::UTF_8, encoded_string.encoding
47
- end
40
+ should 'not mutate the original string' do
41
+ ascii_string = 'new string'.encode('ASCII')
42
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(ascii_string)
48
43
 
49
- should "return UTF-8 encoded string" do
50
- original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
44
+ assert_equal ascii_string, encoded_string
45
+ assert_equal Encoding::ASCII, ascii_string.encoding
46
+ assert_equal Encoding::UTF_8, encoded_string.encoding
47
+ end
51
48
 
52
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
49
+ should 'return UTF-8 encoded string' do
50
+ original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
53
51
 
54
- assert_equal original_string, encoded_string
55
- assert_equal Encoding::UTF_8, encoded_string.encoding
56
- end
52
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string)
57
53
 
58
- should "return UTF-8 encoded string without BOM" do
59
- original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
54
+ assert_equal original_string, encoded_string
55
+ assert_equal Encoding::UTF_8, encoded_string.encoding
56
+ end
60
57
 
61
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
58
+ context "normalize_utf16" do
59
+ UTF16_LE_BOM = "\xFF\xFE"
60
+ UTF16_BE_BOM = "\xFE\xFF"
61
+ UTF16_LE_TEST_STRING = (UTF16_LE_BOM + "v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY').freeze
62
+ UTF16_BE_TEST_STRING = (UTF16_BE_BOM + "\x00v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY').freeze
62
63
 
63
- assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
64
- assert_equal Encoding::UTF_8, encoded_string.encoding
65
- end
64
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string when true' do
65
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: true)
66
66
 
67
- should "return UTF-8 encoded string using to_s alias" do
68
- original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
67
+ assert_equal "valid,utf-16\nsecond", encoded_string
68
+ assert_equal Encoding::UTF_8, encoded_string.encoding
69
+ end
69
70
 
70
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_s
71
+ should 'not check for UTF-16LE in BINARY and return UTF-8 encoded string when false' do
72
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: false)
73
+ expected = "ÿþv\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d\u0000"
74
+ assert_equal expected, encoded_string
75
+ assert_equal Encoding::UTF_8, encoded_string.encoding
76
+ end
71
77
 
72
- assert_equal original_string, encoded_string
73
- assert_equal Encoding::UTF_8, encoded_string.encoding
74
- end
78
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when true' do
79
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: true)
75
80
 
76
- should "return UTF-8 encoded string after falling back to CP1252 encoding" do
77
- string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five\xF6,six,seven,eight\n"
78
- expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~fiveö,six,seven,eight\n"
81
+ assert_equal "valid,utf-16\nsecond", encoded_string
82
+ assert_equal Encoding::UTF_8, encoded_string.encoding
83
+ end
79
84
 
80
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
85
+ should 'not check for UTF-16BE in BINARY and return UTF-8 encoded string when false' do
86
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: false)
87
+ expected = "þÿ\u0000v\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d"
88
+ assert_equal expected, encoded_string
89
+ assert_equal Encoding::UTF_8, encoded_string.encoding
90
+ end
81
91
 
82
- assert_equal expected_string, encoded_string
83
- assert_equal Encoding::UTF_8, encoded_string.encoding
84
- end
92
+ context "containing embedded CP1252" do
93
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with "private" CP1252 when normalize_utf16: true, normalize_cp1252: false' do
94
+ original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
85
95
 
86
- should "return UTF-8 encoded string with normalized return chars" do
87
- string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
88
- expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
96
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
89
97
 
90
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
98
+ assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
99
+ assert_equal Encoding::UTF_8, encoded_string.encoding
100
+ end
91
101
 
92
- assert_equal expected_string, encoded_string
93
- assert_equal Encoding::UTF_8, encoded_string.encoding
94
- end
102
+ should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with normalized CP1252 when normalize_utf16: true, normalize_cp1252: true' do
103
+ original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
104
+
105
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true)
106
+
107
+ assert_equal "‘smart quotes’\nsecond", encoded_string
108
+ assert_equal Encoding::UTF_8, encoded_string.encoding
109
+ end
110
+
111
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: false' do
112
+ original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
113
+
114
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
115
+
116
+ assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
117
+ assert_equal Encoding::UTF_8, encoded_string.encoding
118
+ end
119
+
120
+ should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: true' do
121
+ original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
122
+
123
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: true)
124
+
125
+ assert_equal "‘smart quotes’\nsecond", encoded_string
126
+ assert_equal Encoding::UTF_8, encoded_string.encoding
127
+ end
128
+ end
129
+ end
130
+
131
+ context 'normalize_cp1252' do
132
+ setup do
133
+ @string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five,\x91smart quotes\x92,\x93suck!\x94\n"
134
+ end
135
+
136
# With the CP1252 fallback disabled, input that is not valid UTF-8 must be rejected.
# The message is checked on the captured exception: a Regexp passed to assert_raises is
# treated as another rescue target, not as a message matcher.
should 'raise ArgumentError when false' do
  ex = assert_raises ArgumentError do
    Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_cp1252: false)
  end

  assert_match(/Could not normalize to utf8 due to invalid characters \(probably CP1252\)/, ex.message)
end
141
+
142
+ should 'return UTF-8 encoded string after falling back to CP1252 encoding when true' do
143
+ expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~five,‘smart quotes’,“suck!”\n"
95
144
 
96
- should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
97
- all_8_bit_characters = (1..255).map { |char| char.chr }.join
145
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string)
98
146
 
99
- final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.new(all_8_bit_characters.dup).to_s
100
- expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
147
+ assert_equal expected_string, encoded_string
148
+ assert_equal Encoding::UTF_8, encoded_string.encoding
149
+ end
101
150
 
102
- assert_equal expected_string, final_utf_8_string
151
+ should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
152
+ all_8_bit_characters = (1..255).map(&:chr).join
153
+
154
+ final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(all_8_bit_characters)
155
+ expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
156
+
157
+ assert_equal expected_string, final_utf_8_string
158
+ end
159
+ end
160
+
161
+ context 'normalize_newlines' do
162
+ setup do
163
+ @string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
164
+ end
165
+
166
+ should 'return UTF-8 encoded string without normalized return chars when false' do
167
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: false)
168
+
169
+ assert_equal @string, encoded_string
170
+ assert_equal Encoding::UTF_8, encoded_string.encoding
171
+ end
172
+
173
+ should 'return UTF-8 encoded string with normalized return chars when true' do
174
+ expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
175
+
176
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: true)
177
+
178
+ assert_equal expected_string, encoded_string
179
+ assert_equal Encoding::UTF_8, encoded_string.encoding
180
+ end
181
+ end
182
+
183
+ context 'remove_utf8_bom' do
184
+ setup do
185
+ @original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
186
+ end
187
+
188
+ should 'return UTF-8 encoded string with BOM intact when false' do
189
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: false)
190
+
191
+ assert_equal "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
192
+ assert_equal Encoding::UTF_8, encoded_string.encoding
193
+ end
194
+
195
+ should 'return UTF-8 encoded string without BOM when true' do
196
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: true)
197
+
198
+ assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
199
+ assert_equal Encoding::UTF_8, encoded_string.encoding
200
+ end
201
+ end
202
+
203
+ context 'replace_unicode_beyond_ffff' do
204
+ setup do
205
+ @string = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
206
+ end
207
+
208
# With the replacement switched off, characters above U+FFFF must pass through untouched.
should "leave UTF-8 code points that take > 3 bytes (above U+FFFF) intact when false" do
  encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: false)

  assert_equal @string, encoded_string
  assert_equal Encoding::UTF_8, encoded_string.encoding
end
214
+
215
+ should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when true" do
216
+ expected_string = 'This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF'
217
+
218
+ encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: true)
219
+
220
+ assert_equal expected_string, encoded_string
221
+ assert_equal Encoding::UTF_8, encoded_string.encoding
222
+ end
223
+ end
103
224
  end
104
225
 
105
- should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~" do
106
- string = "This string has some valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
107
- expected_string = "This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF"
226
+ context 'constructor' do
227
+ should 'call normalize_string with the default conversions' do
228
+ mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
108
229
 
109
- encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
230
+ Invoca::Utils::GuaranteedUTF8String.new('').to_string
231
+ end
110
232
 
111
- assert_equal expected_string, encoded_string
112
- assert_equal Encoding::UTF_8, encoded_string.encoding
233
+ should 'do the same when using to_s alias' do
234
+ mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
235
+
236
+ Invoca::Utils::GuaranteedUTF8String.new('').to_s
237
+ end
113
238
  end
114
239
  end
115
240
  end
116
-