RubyGems - invoca-utils - Versions diffs - 0.0.3 → 0.0.4 - Mend

invoca-utils 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/.gitignore +1 -0
data/.rubocop.yml +5 -0
data/Gemfile +11 -1
data/Gemfile.lock +2 -2
data/invoca-utils.gemspec +0 -11
data/lib/invoca/utils.rb +8 -0
data/lib/invoca/utils/diff.rb +290 -282
data/lib/invoca/utils/guaranteed_utf8_string.rb +106 -45
data/lib/invoca/utils/version.rb +1 -1
data/test/helpers/constant_overrides.rb +44 -0
data/test/test_helper.rb +3 -0
data/test/unit/guaranteed_utf8_string_test.rb +191 -67
data/test/unit/time_calculations_test.rb +0 -1
data/test/unit/utils_test.rb +45 -2
metadata +6 -129

data/lib/invoca/utils/guaranteed_utf8_string.rb CHANGED Viewed

@@ -1,68 +1,129 @@
 # frozen_string_literal: true
-# This class expects to be initialized with a string and guarantees that the output of the to_string method is in UTF-8 format and fits in 3 bytes/char or less.
+# This class provides a normalize_string method that guarantees that its result is in valid UTF-8
+# format for Ruby and all versions of MySQL (using mb3 storage).
+#
+# [Deprecated] Equivalently, you can also create an instance of this class and call to_string or to_s on it.
 module Invoca
   module Utils
     class GuaranteedUTF8String
-      def initialize(string)
-        if string.is_a?(String) ||
-          (string.respond_to?(:to_s) &&
-           string.method(:to_s).owner != Kernel)  # the lame .to_s from Kernel just calls .inspect :(
-          @string = string.to_s
-        else
-          raise ArgumentError, "#{self.class} must be initialized with a string or an object with a non-Kernel .to_s method but instead was #{string.class} #{string.inspect}"
-        end
-      end
+      attr_reader :to_string
-      def to_string
-        @to_string ||= normalize_string(@string)
+      def initialize(string)
+        @to_string = self.class.normalize_string(string)
       end
-      alias_method :to_s, :to_string
+      alias to_s to_string
       private
       # chosen because this is a 1-byte ASCII character that is not used in any of the popular escaping systems: XML, HTML, HTTP URIs, HTTP Form Post, JSON
-      REPLACE_CHARACTER = '~' unless defined?(REPLACE_CHARACTER)
+      REPLACE_CHARACTER = '~'
-      def normalize_string(str)
-        str = @string.dup
-        str.force_encoding('UTF-8')
-        if !str.valid_encoding?
-          cp1252_to_utf_8(str)
+      class << self
+        def normalize_string(orig_string,
+                             normalize_utf16:              true,
+                             normalize_cp1252:             true,
+                             normalize_newlines:           true,
+                             remove_utf8_bom:              true,
+                             replace_unicode_beyond_ffff:  true)
+          string =  if orig_string.is_a?(String) ||
+                      (orig_string.respond_to?(:to_s) &&
+                        orig_string.method(:to_s).owner != Kernel) # the lame .to_s from Kernel just calls .inspect :(
+                      orig_string.to_s.dup
+                    else
+                      raise ArgumentError, "must be passed a string or an object with a non-Kernel .to_s method but instead was #{orig_string.class} #{orig_string.inspect}"
+                    end
+          string.force_encoding('UTF-8')
+          normalize_string_from_utf8(string,
+                                     normalize_utf16: normalize_utf16,
+                                     normalize_cp1252: normalize_cp1252,
+                                     normalize_newlines: normalize_newlines,
+                                     remove_utf8_bom: remove_utf8_bom,
+                                     replace_unicode_beyond_ffff: replace_unicode_beyond_ffff)
         end
-        normalize_newlines(str)
-        remove_bom(str)
-        replace_unicode_beyond_ffff(str)
-        str
-      end
-      def normalize_newlines(str)
-        str.gsub!(/ \r\n | \r | \n /x, "\n")
-      end
+        private
-      def cp1252_to_utf_8(str)
-        str.force_encoding('CP1252')
-        str.encode!(
-          'UTF-8',
-          replace: REPLACE_CHARACTER,
-          undef: :replace,
-          invalid: :replace
-        )
-      end
+        def normalize_string_from_utf8(string,
+                                       normalize_utf16:,
+                                       normalize_cp1252:,
+                                       normalize_newlines:,
+                                       remove_utf8_bom:,
+                                       replace_unicode_beyond_ffff:)
+          found_utf_16 = normalize_utf_16(string, normalize_cp1252: normalize_cp1252) if normalize_utf16
+          if found_utf_16
+            string.encode!('UTF-8')
+          else
+            unless string.valid_encoding?
+              if normalize_cp1252
+                cp1252_to_utf_8(string)
+              else
+                raise ArgumentError, 'Could not normalize to utf8 due to invalid characters (probably CP1252)'
+              end
+            end
+          end
+          normalize_newlines(string)           if normalize_newlines
+          remove_utf8_bom(string)              if remove_utf8_bom
+          replace_unicode_beyond_ffff(string)  if replace_unicode_beyond_ffff
+          string
+        end
-      def remove_bom(str)
-        str.sub!(/\A \xEF\xBB\xBF/x, '')
-      end
+        UTF_16_LE_BOM = "\xFF\xFE"
+        UTF_16_BE_BOM = "\xFE\xFF"
+        UTF_8_BOM     = "\xEF\xBB\xBF"
+        PRIVATE_CP1252_CHAR_PATTERN = "[\u0080-\u009f]"
+        PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16LE'))
+        PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE = Regexp.new(PRIVATE_CP1252_CHAR_PATTERN.encode('UTF-16BE'))
+        # returns truthy iff UTF_16 was found, in which case it has been normalized but the string is still UTF-16
+        # otherwise returns falsey and leaves the string as is
+        def normalize_utf_16(string, normalize_cp1252:)
+          case string[0, 2]
+          when UTF_16_LE_BOM
+            string.slice!(0, 2)                 # remove the BOM
+            string.force_encoding('UTF-16LE')
+            normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16LE, 'UTF-16LE') if normalize_cp1252
+            true
+          when UTF_16_BE_BOM
+            string.slice!(0, 2)                 # remove the BOM
+            string.force_encoding('UTF-16BE')
+            normalize_multibyte_cp1252(string, PRIVATE_CP1252_CHAR_PATTERN_UTF_16BE, 'UTF-16BE') if normalize_cp1252
+            true
+          end
+        end
+        def normalize_multibyte_cp1252(string, pattern, encoding)
+          string.gsub!(pattern) { |c| c.ord.chr.force_encoding('CP1252').encode('UTF-8').encode(encoding) }
+        end
-      # Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
-      # is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
-      # it would take a data migration and didn't seem that important.
+        def normalize_newlines(string)
+          string.gsub!(/ \r\n | \r | \n /x, "\n")
+        end
+        def cp1252_to_utf_8(string)
+          string.force_encoding('CP1252')
+          string.encode!(
+            'UTF-8',
+            replace:  REPLACE_CHARACTER,
+            undef:    :replace,
+            invalid:  :replace
+          )
+        end
-      def replace_unicode_beyond_ffff(str)
-        str.gsub!(/[^\u{0}-\u{ffff}]/x, REPLACE_CHARACTER)
+        def remove_utf8_bom(string)
+          string.sub!(/\A #{UTF_8_BOM}/x, '')
+        end
+        # Note MySQL can only store Unicode up to code point U+FFFF in the standard mb3 storage type. There is an option to use mb4 which
+        # is needed to hold the code points above that (including emoji) but we haven't enabled that on any columns yet since
+        # it would take a data migration and didn't seem that important.
+        def replace_unicode_beyond_ffff(string)
+          string.gsub!(/[^\u0000-\uffff]/x, REPLACE_CHARACTER)
+        end
       end
     end
   end
 end

data/lib/invoca/utils/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module Invoca
   module Utils
-    VERSION = "0.0.3"
+    VERSION = "0.0.4"
   end
 end

data/test/helpers/constant_overrides.rb ADDED Viewed

@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+# The same as https://github.com/Invoca/test_overrides/blob/master/lib/constant_overrides.rb,
+# but less coupled to ExceptionHandling and Invoca microservices.
+module ConstantOverrides
+  def setup_constant_overrides
+    @constant_overrides = []
+  end
+  def cleanup_constant_overrides
+    @constant_overrides.reverse.each do |parent_module, k, v|
+      silence_warnings do
+        if v == :never_defined
+          parent_module.send(:remove_const, k)
+        else
+          parent_module.const_set(k, v)
+        end
+      end
+    end
+  end
+  def set_test_const(const_name, value)
+    const_name.is_a?(Symbol) and (const_name = const_name.to_s)
+    const_name.is_a?(String) or raise "Pass the constant name, not its value!"
+    final_parent_module = final_const_name = nil
+    original_value      =
+      const_name.split('::').reduce(Object) do |parent_module, nested_const_name|
+        parent_module == :never_defined and raise "You need to set each parent constant earlier! #{nested_const_name}"
+        final_parent_module = parent_module
+        final_const_name    = nested_const_name
+        begin
+          parent_module.const_get(nested_const_name)
+        rescue
+          :never_defined
+        end
+      end
+    @constant_overrides << [final_parent_module, final_const_name, original_value]
+    silence_warnings { final_parent_module.const_set(final_const_name, value) }
+  end
+end

data/test/test_helper.rb CHANGED Viewed

@@ -3,3 +3,6 @@ require "minitest/autorun"
 require 'rr'
 require 'shoulda'
 require 'pry'
+require 'active_support/all'
+require 'invoca/utils'

data/test/unit/guaranteed_utf8_string_test.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 # frozen_string_literal: true
+require_relative '../../lib/invoca/utils/guaranteed_utf8_string'
 require_relative '../test_helper'
 class GuaranteedUTF8StringTest < Minitest::Test
@@ -18,99 +19,222 @@ class GuaranteedUTF8StringTest < Minitest::Test
     end
   end
-  should "raise an error if initialized with an object with no to_s method" do
-    assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/ do
-      Invoca::Utils::GuaranteedUTF8String.new(HasNoTo_sMethod.new)
-    end
-  end
+  context Invoca::Utils::GuaranteedUTF8String do
+    context '.normalize_string' do
+      should 'raise an error if called with an object with no to_s method' do
+        ex = assert_raises ArgumentError do
+          Invoca::Utils::GuaranteedUTF8String.normalize_string(HasNoTo_sMethod.new)
+        end
-  should "raise an error if initialized with a basic Ruby object" do
-    assert_raises ArgumentError, /GuaranteedUTF8String must be initialized with a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/ do
-      Invoca::Utils::GuaranteedUTF8String.new(BasicObjectWithKernelMethods.new)
-    end
-  end
+        assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::HasNoTo_sMethod/, ex.message)
+      end
-  should "convert to a string with to_s if possible" do
-    result = Invoca::Utils::GuaranteedUTF8String.new(ConvertibleToString.new("test string"))
-    assert_equal "test string", result.to_string
-  end
+      should 'raise an error if called with a basic Ruby object' do
+        ex = assert_raises ArgumentError do
+          Invoca::Utils::GuaranteedUTF8String.normalize_string(BasicObjectWithKernelMethods.new)
+        end
-  context "#to_string" do
-    should "not mutate the original string" do
-      ascii_string = "new string".encode("ASCII")
-      utf8_string_instance = Invoca::Utils::GuaranteedUTF8String.new(ascii_string)
-      encoded_string = utf8_string_instance.to_string
+        assert_match(/must be passed a string or an object with a non-Kernel \.to_s method but instead was GuaranteedUTF8StringTest::BasicObjectWithKernelMethods/, ex.message)
+      end
-      assert_equal ascii_string, encoded_string
-      assert_equal Encoding::ASCII, ascii_string.encoding
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+      should 'not mutate the original string' do
+        ascii_string = 'new string'.encode('ASCII')
+        encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(ascii_string)
-    should "return UTF-8 encoded string" do
-      original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
+        assert_equal ascii_string, encoded_string
+        assert_equal Encoding::ASCII, ascii_string.encoding
+        assert_equal Encoding::UTF_8, encoded_string.encoding
+      end
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
+      should 'return UTF-8 encoded string' do
+        original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
-      assert_equal original_string, encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+        encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string)
-    should "return UTF-8 encoded string without BOM" do
-      original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
+        assert_equal original_string, encoded_string
+        assert_equal Encoding::UTF_8, encoded_string.encoding
+      end
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_string
+      context "normalize_utf16" do
+        UTF16_LE_BOM = "\xFF\xFE"
+        UTF16_BE_BOM = "\xFE\xFF"
+        UTF16_LE_TEST_STRING = (UTF16_LE_BOM + "v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY').freeze
+        UTF16_BE_TEST_STRING = (UTF16_BE_BOM + "\x00v\x00a\x00l\x00i\x00d\x00,\x00u\x00t\x00f\x00-\x001\x006\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY').freeze
-      assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+        should 'accept UTF-16LE in BINARY and return UTF-8 encoded string when true' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: true)
-    should "return UTF-8 encoded string using to_s alias" do
-      original_string = "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
+          assert_equal "valid,utf-16\nsecond", encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(original_string).to_s
+        should 'not check for UTF-16LE in BINARY and return UTF-8 encoded string when false' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_LE_TEST_STRING, normalize_utf16: false)
+          expected = "ÿþv\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d\u0000"
+          assert_equal expected, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
-      assert_equal original_string, encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+        should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when true' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: true)
-    should "return UTF-8 encoded string after falling back to CP1252 encoding" do
-      string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five\xF6,six,seven,eight\n"
-      expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~fiveö,six,seven,eight\n"
+          assert_equal "valid,utf-16\nsecond", encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
+        should 'not check for UTF-16BE in BINARY and return UTF-8 encoded string when false' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(UTF16_BE_TEST_STRING, normalize_utf16: false)
+          expected = "þÿ\u0000v\u0000a\u0000l\u0000i\u0000d\u0000,\u0000u\u0000t\u0000f\u0000-\u00001\u00006\u0000\n\u0000s\u0000e\u0000c\u0000o\u0000n\u0000d"
+          assert_equal expected, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
-      assert_equal expected_string, encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+        context "containing embedded CP1252" do
+          should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with "private" CP1252 when normalize_utf16: true, normalize_cp1252: false' do
+            original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
-    should "return UTF-8 encoded string with normalized return chars" do
-      string          = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
-      expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
+            encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
+            assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
+            assert_equal Encoding::UTF_8, encoded_string.encoding
+          end
-      assert_equal expected_string, encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
-    end
+          should 'accept UTF-16LE in BINARY and return UTF-8 encoded string with normalized CP1252 when normalize_utf16: true, normalize_cp1252: true' do
+            original_string = (UTF16_LE_BOM + "\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d\x00").force_encoding('BINARY')
+            encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true)
+            assert_equal "‘smart quotes’\nsecond", encoded_string
+            assert_equal Encoding::UTF_8, encoded_string.encoding
+          end
+          should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: false' do
+            original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
+            encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: false)
+            assert_equal "\u0091smart quotes\u0092\nsecond", encoded_string
+            assert_equal Encoding::UTF_8, encoded_string.encoding
+          end
+          should 'accept UTF-16BE in BINARY and return UTF-8 encoded string when normalize_utf16: true, normalize_cp1252: true' do
+            original_string = (UTF16_BE_BOM + "\x00\x91\x00s\x00m\x00a\x00r\x00t\x00 \x00q\x00u\x00o\x00t\x00e\x00s\x00\x92\x00\n\x00s\x00e\x00c\x00o\x00n\x00d").force_encoding('BINARY')
+            encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(original_string, normalize_utf16: true, normalize_cp1252: true)
+            assert_equal "‘smart quotes’\nsecond", encoded_string
+            assert_equal Encoding::UTF_8, encoded_string.encoding
+          end
+        end
+      end
+      context 'normalize_cp1252' do
+        setup do
+          @string = "This,is,NOT,a,valid,utf-8,csv,string\r\none,two,three,four,\x81five,\x91smart quotes\x92,\x93suck!\x94\n"
+        end
+        should 'raise ArgumentError when false' do
+          assert_raises(ArgumentError, /xxyy/) do
+            Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_cp1252: false)
+          end
+        end
+        should 'return UTF-8 encoded string after falling back to CP1252 encoding when true' do
+          expected_string = "This,is,NOT,a,valid,utf-8,csv,string\none,two,three,four,~five,‘smart quotes’,“suck!”\n"
-    should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
-      all_8_bit_characters = (1..255).map { |char| char.chr }.join
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string)
-      final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.new(all_8_bit_characters.dup).to_s
-      expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+          assert_equal expected_string, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
-      assert_equal expected_string, final_utf_8_string
+        should "encode all 255 UTF-8 characters, returning ~ when the character isn't mapped in CP1252" do
+          all_8_bit_characters = (1..255).map(&:chr).join
+          final_utf_8_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(all_8_bit_characters)
+          expected_string = "\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000A\u000B\u000C\u000A\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007F€~‚ƒ„…†‡ˆ‰Š‹Œ~Ž~~‘’“”•–—˜™š›œ~žŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
+          assert_equal expected_string, final_utf_8_string
+        end
+      end
+      context 'normalize_newlines' do
+        setup do
+          @string = "This string\n\n\n has line feeds\ncarriage\r\r returns\rand Windows\r\n\r\n new line chars\r\nend of \n\r\r\r\nstring"
+        end
+        should 'return UTF-8 encoded string without normalized return chars when false' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: false)
+          assert_equal @string, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+        should 'return UTF-8 encoded string with normalized return chars when true' do
+          expected_string = "This string\n\n\n has line feeds\ncarriage\n\n returns\nand Windows\n\n new line chars\nend of \n\n\n\nstring"
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, normalize_newlines: true)
+          assert_equal expected_string, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+      end
+      context 'remove_utf8_bom' do
+        setup do
+          @original_string = "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n"
+        end
+        should 'return UTF-8 encoded string with BOM intact when false' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: false)
+          assert_equal "\xEF\xBB\xBFthis,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+        should 'return UTF-8 encoded string without BOM when true' do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@original_string, remove_utf8_bom: true)
+          assert_equal "this,is,a,valid,utf-8,csv,string\none,two,three,four,five,six,seven\n", encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+      end
+      context 'replace_unicode_beyond_ffff' do
+        setup do
+          @string = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
+        end
+        should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when false" do
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: false)
+          assert_equal @string, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+        should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~ when true" do
+          expected_string = 'This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF'
+          encoded_string = Invoca::Utils::GuaranteedUTF8String.normalize_string(@string, replace_unicode_beyond_ffff: true)
+          assert_equal expected_string, encoded_string
+          assert_equal Encoding::UTF_8, encoded_string.encoding
+        end
+      end
     end
-    should "consider UTF-8 code points that take > 3 bytes (above U+FFFF) to be invalid (since MySQL can't store them unless column is declared mb4) and encode them as ~" do
-      string          = "This string has some ✓ valid UTF-8 but also some 😹 emoji \xf0\x9f\x98\xb9 that are > U+FFFF"
-      expected_string = "This string has some ✓ valid UTF-8 but also some ~ emoji ~ that are > U+FFFF"
+    context 'constructor' do
+      should 'call normalize_string with the default conversions' do
+        mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
-      encoded_string = Invoca::Utils::GuaranteedUTF8String.new(string).to_string
+        Invoca::Utils::GuaranteedUTF8String.new('').to_string
+      end
-      assert_equal expected_string, encoded_string
-      assert_equal Encoding::UTF_8, encoded_string.encoding
+      should 'do the same when using to_s alias' do
+        mock(Invoca::Utils::GuaranteedUTF8String).normalize_string('')
+        Invoca::Utils::GuaranteedUTF8String.new('').to_s
+      end
     end
   end
 end