RubyGems - marc - Versions diffs - 0.7.1 → 0.8.0 - Mend

marc 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/README.md +7 -18
data/Rakefile +3 -4
data/lib/marc/marc8/map_to_unicode.rb +16458 -0
data/lib/marc/marc8/to_unicode.rb +198 -0
data/lib/marc/reader.rb +133 -112
data/lib/marc/version.rb +1 -1
data/test/bad_eacc_encoding.marc8.marc +1 -0
data/test/bib178448.okay.human +24 -0
data/test/bib178448.okay.marc +1 -0
data/test/bib178448.writtenout.marc +1 -0
data/test/escaped_character_reference.marc8.marc +1 -0
data/test/marc8/data/test_marc8.txt +1514 -0
data/test/marc8/data/test_utf8.txt +1514 -0
data/test/marc8/tc_marc8_mapping.rb +11 -0
data/test/marc8/tc_to_unicode.rb +154 -0
data/test/marc_with_bad_utf8.utf8.human +40 -0
data/test/marc_with_bad_utf8.utf8.marc +1 -0
data/test/tc_reader_char_encodings.rb +92 -5
metadata +61 -15
data/test/tc_weird_jruby_bytes.rb +0 -62

data/lib/marc/marc8/to_unicode.rb ADDED Viewed

@@ -0,0 +1,198 @@
+# encoding: UTF-8
+require 'marc'
+require 'marc/marc8/map_to_unicode'
+require 'unf/normalizer'
+module MARC
+  module Marc8
+    # Class to convert Marc8 to UTF-8. NOTE: Requires ruby 1.9+ (this could be
+    # changed without too much trouble, but we just don't care to support 1.8.7 anymore.)
+    #
+    # http://www.loc.gov/marc/specifications/speccharmarc8.html
+    #
+    # NOT thread-safe, it needs to keep state as it goes through a string,
+    # do not re-use between threads.
+    #
+    # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
+    #
+    # Returns UTF-8 encoded string! Encode to something else if you want
+    # something else.
+    #
+    # III proprietary code points?
+    class ToUnicode
+      BASIC_LATIN = 0x42
+      ANSEL = 0x45
+      G0_SET = ['(', ',', '$']
+      G1_SET = [')', '-', '$']
+      CODESETS = MARC::Marc8::MapToUnicode::CODESETS
+      # These are state flags, MARC8 requires you to keep
+      # track of 'current char sets' or something like that, which
+      # are changed with escape codes, or something like that.
+      attr_accessor :g0, :g1
+      def initialize
+        self.g0 = BASIC_LATIN
+        self.g1 = ANSEL
+      end
+      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
+      #
+      # Bad Marc8 bytes?  By default will raise an Encoding::InvalidByteSequenceError
+      # (will not have full metadata filled out, but will have a decent error message)
+      #
+      # Set option :invalid => :replace to instead silently replace bad bytes
+      # with a replacement char -- by default Unicode Replacement Char, but can set
+      # option :replace to something else, including empty string.
+      #
+      # converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
+      #
+      # By default returns NFC normalized, but set :normalization option to:
+      #    :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
+      #    we won't do any normalization just take it as it comes out of the
+      #    transcode algorithm. This will generally NOT be composed.
+      #
+      # By default, escaped unicode 'named character references' in Marc8 will
+      # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
+      # to disable. http://www.loc.gov/marc/specifications/speccharconversion.html#lossless
+      #
+      # String arg passed in WILL have it's encoding tagged 'binary' if
+      # it's not already, if it's Marc8 there's no good reason for it not to
+      # be already.
+      def transcode(marc8_string, options = {})
+        invalid_replacement     = options.fetch(:replace, "\uFFFD")
+        expand_ncr              = options.fetch(:expand_ncr, true)
+        normalization           = options.fetch(:normalization, :nfc)
+        # don't choke on empty marc8_string
+        return "" if marc8_string.nil? || marc8_string.empty?
+        # Make sure to call it 'binary', so we can slice it
+        # byte by byte, and so ruby doesn't complain about bad
+        # bytes for some other encoding. Yeah, we're changing
+        # encoding on input! If it's Marc8, it ought to be tagged
+        # binary already.
+        marc8_string.force_encoding("binary")
+        uni_list = []
+        combinings = []
+        pos = 0
+        while pos < marc8_string.length
+            if marc8_string[pos] == "\x1b"
+                next_byte = marc8_string[pos+1]
+                if G0_SET.include? next_byte
+                    if marc8_string.length >= pos + 3
+                        if marc8_string[pos+2] == ',' and next_byte == '$'
+                            pos += 1
+                        end
+                        self.g0 = marc8_string[pos+2].ord
+                        pos = pos + 3
+                        next
+                    else
+                        # if there aren't enough remaining characters, readd
+                        # the escape character so it doesn't get lost; may
+                        # help users diagnose problem records
+                        uni_list.push marc8_string[pos]
+                        pos += 1
+                        next
+                    end
+                elsif G1_SET.include? next_byte
+                    if marc8_string[pos+2] == '-' and next_byte == '$'
+                        pos += 1
+                    end
+                    self.g1 = marc8_string[pos+2].ord
+                    pos = pos + 3
+                    next
+                else
+                    charset = next_byte.ord
+                    if CODESETS.has_key? charset
+                        self.g0 = charset
+                        pos += 2
+                    elsif charset == 0x73
+                        self.g0 = BASIC_LATIN
+                        pos += 2
+                        if pos == marc8_string.length
+                            break
+                        end
+                    end
+                end
+            end
+            mb_flag = is_multibyte(self.g0)
+            if mb_flag
+                code_point = (marc8_string[pos].ord * 65536 +
+                     marc8_string[pos+1].ord * 256 +
+                     marc8_string[pos+2].ord)
+                pos += 3
+            else
+                code_point = marc8_string[pos].ord
+                pos += 1
+            end
+            if (code_point < 0x20 or
+                (code_point > 0x80 and code_point < 0xa0))
+                uni = unichr(code_point)
+                next
+            end
+            begin
+              code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
+              (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
+              if cflag
+                  combinings.push unichr(uni)
+              else
+                  uni_list.push unichr(uni)
+                  if combinings.length > 0
+                      uni_list.concat combinings
+                      combinings = []
+                  end
+              end
+            rescue KeyError
+              if options[:invalid] == :replace
+                # Let's coallesece multiple replacements
+                uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
+                pos += 1
+              else
+                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
+              end
+            end
+        end
+        # what to do if combining chars left over?
+        uni_str = uni_list.join('')
+        if expand_ncr
+          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
+            [$1.hex].pack("U")
+          end
+        end
+        if normalization
+          uni_str = UNF::Normalizer.normalize(uni_str, normalization)
+        end
+        return uni_str
+      end
+      # from the original python, yeah, apparently
+      # only one charset is considered multibyte
+      def is_multibyte(charset)
+        charset == 0x31
+      end
+      # input single unicode codepoint as integer; output encoded as a UTF-8 string
+      # python has unichr built-in, we just define it for convenience no problem.
+      def unichr(code_point)
+        [code_point].pack("U")
+      end
+    end
+  end
+end

data/lib/marc/reader.rb CHANGED Viewed

@@ -1,35 +1,48 @@
+require 'ensure_valid_encoding'
+require 'marc/marc8/to_unicode'
 module MARC
   # A class for reading MARC binary (ISO 2709) files.
   #
   # == Character Encoding
   #
-  # In ruby 1.8, if you mess up your character encodings, you may get
-  # garbage bytes. MARC::Reader takes no special action to determine or
-  # correct character encodings in ruby 1.8.
-  #
-  # In ruby 1.9, if character encodings get confused, you will likely get an
-  # exception raised at some point, either from inside MARC::Reader or in your
-  # own code. If your marc records are not in UTF-8, you will have to make sure
-  # MARC::Reader knows what character encoding to expect. For UTF-8, normally
-  # it will just work.
-  #
-  # Note that if your source data includes invalid illegal characters
-  # for it's encoding, while it _may_ not cause MARC::Reader to raise an
-  # exception, it will likely result in an exception at a later point in
-  # your own code. You can ask MARC::Reader to remove invalid bytes from data,
-  # see :invalid and :replace options below.
-  #
-  # In ruby 1.9, it's important strings are tagged with their proper encoding.
-  # **MARC::Reader does _not_ at present look inside the MARC file to see what
-  # encoding it claims for itself** -- real world MARC records are so unreliable
-  # here as to limit utility; and we have international users and international
-  # MARC uses several conventions for this. Instead, MARC::Reader uses ordinary
-  # ruby conventions.  If your data is in UTF-8, it'll probably Just Work,
-  # otherwise you simply have to tell MARC::Reader what the source encoding is:
-  #
-  #     Encoding.default_external # => usually "UTF-8" for most people
-  #     # marc data will be considered UTF-8, as per Encoding.default_external
-  #     MARC::Reader.new("path/to/file.marc")
+  # In ruby 1.9+, ruby tags all strings with expected character encodings.
+  # If illegal bytes for that character encoding are encountered in certain
+  # operations, ruby will raise an exception. If a String is incorrectly
+  # tagged with the wrong character encoding, that makes it fairly likely
+  # an illegal byte for the specified encoding will be encountered.
+  #
+  # So when reading binary MARC data with the MARC::Reader, it's important
+  # that you let it know the expected encoding:
+  #
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8")
+  #
+  # If you leave off 'external_encoding', it will use the ruby environment
+  # Encoding.default_external, which is usually UTF-8 but may depend on your
+  # environment.
+  #
+  # Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
+  # bytes. By default MARC::Reader will leave these in the produced Strings,
+  # which will probably raise an exception later in your program. Better
+  # to catch this early, and ask MARC::Reader to raise immediately on illegal
+  # bytes:
+  #
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #       :validate_encoding => true)
+  #
+  # Alternately, you can have MARC::Reader replace illegal bytes
+  # with the Unicode Replacement Character, or with a string
+  # of your choice (including the empty string, meaning just omit the bad bytes)
+  #
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #        :invalid => :replace)
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #        :invalid => :replace, :replace => "")
+  #
+  # If you supply an :external_encoding argument, MARC::Reader will
+  # always assume that encoding -- if you leave it off, MARC::Reader
+  # will use the encoding tagged on any input you pass in, such
+  # as Strings or File handles.
   #
   #     # marc data will have same encoding as string.encoding:
   #     MARC::Reader.decode( string )
@@ -44,17 +57,42 @@ module MARC
   #     # explicitly tell MARC::Reader the encoding
   #     MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
   #
-  #     # If you have Marc8 data, you _really_ want to convert it
-  #     # to UTF8 outside of ruby, but if you can't:
-  #     MARC::Reader.new("marc8.marc" :external_encoding => "binary")
-  #     # But you probably _will_ have problems subsequently in your own
-  #     # own code using the MARC::Record.
+  # === MARC-8
+  #
+  # The legacy MARC-8 encoding needs to be handled differently, because
+  # there is no built-in support in ruby for MARC-8.
+  #
+  # You _can_ specify "MARC-8" as an external encoding. It will trigger
+  # trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
+  #
+  #     MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
+  #
+  # For external_encoding "MARC-8", :validate_encoding is always true,
+  # there's no way to ignore bad bytes in MARC-8 when transcoding to
+  # unicode.  However, just as with other encodings, the
+  # `:invalid => :replace` and `:replace => "string"`
+  # options can be used to replace bad bytes instead of raising.
+  #
+  # If you want your MARC-8 to be transcoded internally to something
+  # other than UTF-8, you can use the :internal_encoding option
+  # which works with any encoding in MARC::Reader.
+  #
+  #     MARC::Reader.new("marc8.mrc",
+  #       :external_encoding => "MARC-8",
+  #       :internal_encoding => "UTF-16LE")
+  #
+  # If you want to read in MARC-8 without transcoding, leaving the
+  # internal Strings in MARC-8, the only way to do that is with
+  # ruby's 'binary' (aka "ASCII-8BIT") encoding, since ruby doesn't
+  # know from MARC-8. This will work:
+  #
+  #     MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
   #
-  # One way or another, you have to tell MARC::Reader what the external
-  # encoding is, if it's not the default for your system (usually UTF-8).
-  # It won't guess from internal MARC leader etc.
+  # Please note that MARC::Reader does _not_ currently have any facilities
+  # for guessing encoding from MARC21 leader byte 9, that is ignored.
+  #
+  # === Complete Encoding Options
   #
-  # == Additional Options
   # These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
   # to specify external encoding, ask for a transcode to a different
   # encoding on read, or validate or replace bad bytes in source.
@@ -83,7 +121,7 @@ module MARC
   #    your own replacement string for invalid bytes. You may use the
   #    empty string to simply eliminate invalid bytes.
   #
-  # == Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
+  # === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
   #
   # Be careful with using an explicit File object with the File's own
   # :internal_encoding set -- it can cause ruby to transcode your data
@@ -109,11 +147,14 @@ module MARC
   #    MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:binary:binary"),
   #       :external_encoding => "cp866",
   #       :internal_encoding => "utf-8")
-  # == jruby note
-  # Note all of our char encoding tests currently pass on jruby in ruby 1.9
-  # mode; if you are using binary MARC records in a non-UTF8 encoding, you may
-  # have trouble in jruby. We believe it's a jruby bug.
+  #
+  # === jruby note
+  # In the past, jruby encoding-related bugs have caused problems with
+  # our encoding treatments. See for example:
   # https://jira.codehaus.org/browse/JRUBY-6637
+  #
+  # We recommend using the latest version of jruby, especially
+  # at least jruby 1.7.6.
   class Reader
     include Enumerable
@@ -284,31 +325,10 @@ module MARC
         # remove end of field
         field_data.delete!(END_OF_FIELD)
-        if field_data.respond_to?(:force_encoding)
-          if params[:external_encoding]
-            field_data = field_data.force_encoding(params[:external_encoding])
-          end
-          # If we're transcoding anyway, pass our invalid/replace options
-          # on to String#encode, which will take care of them -- or raise
-          # with illegal bytes without :replace=>:invalid.
-          #
-          # If we're NOT transcoding, we need to use our own pure-ruby
-          # implementation to do invalid byte replacements. OR to raise
-          # a predicatable exception iff :validate_encoding, otherwise
-          # for performance we won't check, and you may or may not
-          # get an exception from inside ruby-marc, and it may change
-          # in future implementations.
-          if params[:internal_encoding]
-            field_data = field_data.encode(params[:internal_encoding], params)
-          elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
-            field_data = MARC::Reader.validate_encoding(field_data,  params)
-          end
-        end
         # add a control field or data field
         if MARC::ControlField.control_tag?(tag)
+          field_data = MARC::Reader.set_encoding( field_data , params)
           record.append(MARC::ControlField.new(tag,field_data))
         else
           field = MARC::DataField.new(tag)
@@ -321,12 +341,13 @@ module MARC
           next if subfields.length() < 2
           # get indicators
-          indicators = subfields.shift()
+          indicators = MARC::Reader.set_encoding( subfields.shift(), params)
           field.indicator1 = indicators[0,1]
           field.indicator2 = indicators[1,1]
           # add each subfield to the field
           subfields.each() do |data|
+            data = MARC::Reader.set_encoding( data, params )
             subfield = MARC::Subfield.new(data[0,1],data[1..-1])
             field.append(subfield)
           end
@@ -337,57 +358,57 @@ module MARC
       end
       return record
-    end
-    # Pass in a string, will raise an Encoding::InvalidByteSequenceError
-    # if it contains an invalid byte for it's encoding; otherwise
-    # returns an equivalent string. Surprisingly not built into
-    # ruby 1.9.3 (yet?). https://bugs.ruby-lang.org/issues/6321
-    #
-    # The InvalidByteSequenceError will NOT be filled out
-    # with the usual error metadata, sorry.
+    end
+    # input passed in probably has 'binary' encoding.
+    # We'll set it to the proper encoding, and depending on settings, optionally
+    # * check for valid encoding
+    #   * raise if not valid
+    #   * or replace bad bytes with replacement chars if not valid
+    # * transcode from external_encoding to internal_encoding
     #
-    # OR, like String#encode, pass in option `:invalid => :replace`
-    # to replace invalid bytes with a replacement string in the
-    # returned string.  Pass in the
-    # char you'd like with option `:replace`, or will, like String#encode
-    # use the unicode replacement char if it thinks it's a unicode encoding,
-    # else ascii '?'.
+    # Special case for encoding "MARC-8" -- will be transcoded to
+    # UTF-8 (then further transcoded to internal_encoding, if set).
+    # For "MARC-8", validate_encoding is always true, there's no way to
+    # ignore bad bytes.
     #
-    # in any case, method will raise, or return a new string
-    # that is #valid_encoding?
-    def self.validate_encoding(str, options = {})
-      return str unless str.respond_to?(:encoding)
-      if str.valid_encoding?
-        return str
-      elsif options[:invalid] != :replace
-        # If we're not replacing, just raise right away without going through
-        # chars for performance.
-        #
-        # That does mean we're not able to say exactly what byte was bad though.
-        # And the exception isn't filled out with all it's usual attributes,
-        # which would be hard even we were going through all the chars/bytes.
-        raise  Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
-      else
-        # :replace => :invalid,
-        # actually need to go through chars to replace bad ones
-        return str.chars.collect do |c|
-          if c.valid_encoding?
-            c
+    # Params options:
+    #
+    #  * external_encoding: what encoding the input is expected to be in
+    #  * validate_encoding: if true, will raise if the input contains bytes
+    #              that are invalid for its encoding.
+    #  * invalid:  if set to :replace, will replace bad bytes with replacement
+    #              chars instead of raising.
+    #  * replace: Set replacement char for use with 'invalid', otherwise defaults
+    #             to unicode replacement char, or question mark.
+    def self.set_encoding(str, params)
+      if str.respond_to?(:force_encoding)
+        if params[:external_encoding]
+          if params[:external_encoding] == "MARC-8"
+            transcode_params = [:invalid, :replace].each_with_object({}) { |k, hash| hash[k] = params[k] if params.has_key?(k) }
+            str = MARC::Marc8::ToUnicode.new.transcode(str, transcode_params)
           else
-            options[:replace] || (
-             # surely there's a better way to tell if
-             # an encoding is a 'Unicode encoding form'
-             # than this? What's wrong with you ruby 1.9?
-             str.encoding.name.start_with?('UTF') ?
-                "\uFFFD" :
-                "?" )
+            str = str.force_encoding(params[:external_encoding])
           end
-        end.join
-      end
-    end
+        end
+        # If we're transcoding anyway, pass our invalid/replace options
+        # on to String#encode, which will take care of them -- or raise
+        # on illegal bytes unless :invalid => :replace is given.
+        #
+        # If we're NOT transcoding, we need to use our own pure-ruby
+        # implementation to do invalid byte replacements, OR to raise
+        # a predictable exception iff :validate_encoding; otherwise,
+        # for performance, we won't check, and you may or may not
+        # get an exception from inside ruby-marc, and it may change
+        # in future implementations.
+        if params[:internal_encoding]
+          str = str.encode(params[:internal_encoding], params)
+        elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
+          str = EnsureValidEncoding.ensure_valid_encoding(str,  params)
+         end
+       end
+       return str
+    end
   end