RubyGems - marc - Versions diffs - 1.0.3 → 1.0.4 - Mend

marc 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/marc/marc8/to_unicode.rb +21 -21
data/lib/marc/version.rb +1 -1
data/test/marc8/tc_to_unicode.rb +33 -10
metadata +3 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3c354c92026e4cf40c482ef72de120b7aaffac5cbc24e1c369a498bf4519a6ff
-  data.tar.gz: b6daa3e964746945d35cae76eb290cbc0b3dc2e5d03eb9d383e5e5aba473c676
+  metadata.gz: 474c3ee37225584b3e5f189ff5f49507b82741da82aef0dd68a7e39180d25874
+  data.tar.gz: f0d272c5171827dcfa327ae8d079ee0ada52ba8da5b981378f7c2352d16a7a0e
 SHA512:
-  metadata.gz: b49c9c88fb12854317d0f2fc14ce465f261a9ba2db3941f694e690f124063bd905522d9a21e862e777d84992e858839956bd289a275e731be60f638e2b327e89
-  data.tar.gz: 22be396099b50aa7dea829b10485e9f4dc0d7d9232470d1db05b56f95815f591f3eebdfdc44af90c6f8f212ddd982814acd5356de8cd9c88e194d54c93f9ba99
+  metadata.gz: 04361e464361334b874b737e292e58acae879c3d086a68d5292abb5b8ccd9c28370173c3a60ca6d5700ef8dbd941b607a888ef9c7c700623d4440f577423217d
+  data.tar.gz: 22162879382120991f8a76484c2de73ac5f2aa8daf163bb1de0977723102876e0c2f2a81b9e04d2f4c5beda189e4828edc4673c37bf8f02e3a4a1235a2abaf16

data/lib/marc/marc8/to_unicode.rb CHANGED

@@ -12,12 +12,12 @@ module MARC
     # http://www.loc.gov/marc/specifications/speccharmarc8.html
     #
     # NOT thread-safe, it needs to keep state as it goes through a string,
-    # do not re-use between threads.
+    # do not re-use between threads.
     #
-    # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
+    # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
     #
     # Returns UTF-8 encoded string! Encode to something else if you want
-    # something else.
+    # something else.
     #
     # III proprietary code points?
     class ToUnicode
@@ -31,7 +31,7 @@ module MARC
       # These are state flags, MARC8 requires you to keep
       # track of 'current char sets' or something like that, which
-      # are changed with escape codes, or something like that.
+      # are changed with escape codes, or something like that.
       attr_accessor :g0, :g1
       def initialize
@@ -39,21 +39,21 @@ module MARC
         self.g1 = ANSEL
       end
-      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
+      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
       #
       # Bad Marc8 bytes?  By default will raise an Encoding::InvalidByteSequenceError
       # (will not have full metadata filled out, but will have a decent error message)
       #
       # Set option :invalid => :replace to instead silently replace bad bytes
-      # with a replacement char -- by default Unicode Replacement Char, but can set
-      # option :replace to something else, including empty string.
+      # with a replacement char -- by default Unicode Replacement Char, but can set
+      # option :replace to something else, including empty string.
       #
       # converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
       #
       # By default returns NFC normalized, but set :normalization option to:
       #    :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
       #    we won't do any normalization just take it as it comes out of the
-      #    transcode algorithm. This will generally NOT be composed.
+      #    transcode algorithm. This will generally NOT be composed.
       #
       # By default, escaped unicode 'named character references' in Marc8 will
       # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
@@ -61,21 +61,21 @@ module MARC
       #
       # String arg passed in WILL have it's encoding tagged 'binary' if
       # it's not already, if it's Marc8 there's no good reason for it not to
-      # be already.
+      # be already.
       def transcode(marc8_string, options = {})
         invalid_replacement     = options.fetch(:replace, "\uFFFD")
         expand_ncr              = options.fetch(:expand_ncr, true)
         normalization           = options.fetch(:normalization, :nfc)
         # don't choke on empty marc8_string
         return "" if marc8_string.nil? || marc8_string.empty?
         # Make sure to call it 'binary', so we can slice it
         # byte by byte, and so ruby doesn't complain about bad
         # bytes for some other encoding. Yeah, we're changing
         # encoding on input! If it's Marc8, it ought to be tagged
-        # binary already.
+        # binary already.
         marc8_string.force_encoding("binary")
         uni_list = []
@@ -124,7 +124,7 @@ module MARC
             end
             mb_flag = is_multibyte(self.g0)
             if mb_flag
                 code_point = (marc8_string[pos].ord * 65536 +
                      marc8_string[pos+1].ord * 256 +
@@ -134,7 +134,7 @@ module MARC
                 code_point = marc8_string[pos].ord
                 pos += 1
             end
             if (code_point < 0x20 or
                 (code_point > 0x80 and code_point < 0xa0))
                 uni = unichr(code_point)
@@ -144,7 +144,7 @@ module MARC
             begin
               code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
               (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
               if cflag
                   combinings.push unichr(uni)
               else
@@ -160,16 +160,16 @@ module MARC
                 uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
                 pos += 1
               else
-                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
+                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
               end
             end
         end
         # what to do if combining chars left over?
         uni_str = uni_list.join('')
         if expand_ncr
-          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
+          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
             [$1.hex].pack("U")
           end
         end
@@ -177,7 +177,7 @@ module MARC
         if normalization
           uni_str = UNF::Normalizer.normalize(uni_str, normalization)
         end
         return uni_str
       end
@@ -188,11 +188,11 @@ module MARC
       end
       # input single unicode codepoint as integer; output encoded as a UTF-8 string
-      # python has unichr built-in, we just define it for convenience no problem.
+      # python has unichr built-in, we just define it for convenience no problem.
       def unichr(code_point)
         [code_point].pack("U")
       end
     end
   end
-end
+end

data/lib/marc/version.rb CHANGED

@@ -1,3 +1,3 @@
 module MARC
-  VERSION = "1.0.3"
+  VERSION = "1.0.4"
 end

data/test/marc8/tc_to_unicode.rb CHANGED

@@ -32,9 +32,9 @@ if "".respond_to?(:encoding)
     def test_lots_of_marc8_test_cases
       # Heap of test cases taken from pymarc, which provided these
-      # two data files, marc8 and utf8, with line-by-line correspondences.
+      # two data files, marc8 and utf8, with line-by-line correspondences.
       #
-      # For now, we have NOT included proprietary III encodings in our test data!
+      # For now, we have NOT included proprietary III encodings in our test data!
       utf8_file   = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
       marc8_file  = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
@@ -55,7 +55,7 @@ if "".respond_to?(:encoding)
           assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
         end
-      rescue EOFError => each
+      rescue EOFError => each
         # just means the file was over, no biggie
         assert i > 1500, "Read as many lines as we expected to, at least 1500"
       rescue Exception => e
@@ -82,27 +82,50 @@ if "".respond_to?(:encoding)
       assert_equal unicode_d,  converter.transcode(marc8, :normalization => :nfd)
       assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
-      # disable normalization for performance or something, we won't end up with NFC.
+      # disable normalization for performance or something, we won't end up with NFC.
       refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
     end
     def test_expand_ncr
       converter = MARC::Marc8::ToUnicode.new
       marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
       assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
       assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
-    end
+    end
     def test_bad_byte
       converter = MARC::Marc8::ToUnicode.new
       bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
       assert_raise(Encoding::InvalidByteSequenceError) {
-        value = converter.transcode(bad_marc8)
+        converter.transcode(bad_marc8)
       }
     end
+    def test_bad_byte_error_message
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      begin
+        converter.transcode(bad_marc8)
+      rescue Encoding::InvalidByteSequenceError => err
+        assert_equal("MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�", err.message)
+      end
+    end
+    def test_multiple_bad_byte_error_message
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      begin
+        converter.transcode(bad_marc8)
+      rescue Encoding::InvalidByteSequenceError => err
+        # It still identifies the first bad byte found in the offset info, but replaces all bad bytes in the error message
+        assert_equal("MARC8, input byte offset 21, code set: 0x31, code point: 0x7b3639, value: 統治の仕組� 米国の統治の仕組� 米国の統治の仕組�", err.message)
+      end
+    end
     def test_bad_byte_with_replacement
       converter = MARC::Marc8::ToUnicode.new
@@ -112,9 +135,9 @@ if "".respond_to?(:encoding)
       assert_equal "UTF-8", value.encoding.name
       assert value.valid_encoding?
-      assert value.include?("\uFFFD"), "includes replacement char"
+      assert value.include?("\uFFFD"), "includes replacement char"
       # coalescing multiple replacement chars at end, could change
-      # to not do so, important thing is at least one is there.
+      # to not do so, important thing is at least one is there.
       assert_equal "米国の統治の仕組�", value
     end
@@ -150,5 +173,5 @@ if "".respond_to?(:encoding)
   end
 else
   require 'pathname'
-  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
+  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: marc
 version: !ruby/object:Gem::Version
-  version: 1.0.3
+  version: 1.0.4
 platform: ruby
 authors:
 - Kevin Clarke
@@ -13,7 +13,7 @@ authors:
 autorequire: marc
 bindir: bin
 cert_chain: []
-date: 2019-03-27 00:00:00.000000000 Z
+date: 2019-06-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: scrub_rb
@@ -131,8 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.0.3
 signing_key:
 specification_version: 4
 summary: A ruby library for working with Machine Readable Cataloging