RubyGems - marc - Versions diffs - 1.1.1 → 1.3.0 - Mend

marc 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
data/.github/workflows/ruby.yml +24 -0
data/.gitignore +17 -0
data/.standard.yml +1 -0
data/{Changes → CHANGELOG.md} +116 -30
data/Gemfile +5 -0
data/README.md +239 -46
data/Rakefile +14 -14
data/bin/marc +14 -0
data/bin/marc2xml +17 -0
data/examples/xml2marc.rb +10 -0
data/lib/marc/constants.rb +3 -3
data/lib/marc/controlfield.rb +35 -23
data/lib/marc/datafield.rb +70 -63
data/lib/marc/dublincore.rb +59 -41
data/lib/marc/exception.rb +9 -1
data/lib/marc/jsonl_reader.rb +33 -0
data/lib/marc/jsonl_writer.rb +44 -0
data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
data/lib/marc/marc8/to_unicode.rb +80 -87
data/lib/marc/reader.rb +116 -124
data/lib/marc/record.rb +72 -62
data/lib/marc/subfield.rb +12 -10
data/lib/marc/unsafe_xmlwriter.rb +93 -0
data/lib/marc/version.rb +1 -1
data/lib/marc/writer.rb +27 -30
data/lib/marc/xml_parsers.rb +222 -197
data/lib/marc/xmlreader.rb +131 -114
data/lib/marc/xmlwriter.rb +93 -82
data/lib/marc.rb +20 -18
data/marc.gemspec +28 -0
data/test/marc8/tc_marc8_mapping.rb +3 -3
data/test/marc8/tc_to_unicode.rb +28 -34
data/test/messed_up_leader.xml +9 -0
data/test/tc_controlfield.rb +37 -34
data/test/tc_datafield.rb +65 -60
data/test/tc_dublincore.rb +9 -11
data/test/tc_hash.rb +10 -13
data/test/tc_jsonl.rb +19 -0
data/test/tc_marchash.rb +17 -21
data/test/tc_parsers.rb +108 -144
data/test/tc_reader.rb +35 -36
data/test/tc_reader_char_encodings.rb +149 -169
data/test/tc_record.rb +143 -148
data/test/tc_subfield.rb +14 -13
data/test/tc_unsafe_xml.rb +95 -0
data/test/tc_writer.rb +101 -108
data/test/tc_xml.rb +101 -94
data/test/tc_xml_error_handling.rb +7 -8
data/test/ts_marc.rb +8 -8
metadata +129 -22

data/lib/marc/reader.rb CHANGED Viewed

@@ -1,10 +1,8 @@
-require 'scrub_rb'
 # Note: requiring 'marc/marc8/to_unicode' below, in #initialize,
 # only when necessary
 module MARC
-  # A class for reading MARC binary (ISO 2709) files.
+  # A class for reading MARC binary (ISO 2709) files.
   #
   # == Character Encoding
   #
@@ -12,7 +10,7 @@ module MARC
   # If illegal bytes for that character encoding are encountered in certain
   # operations, ruby will raise an exception. If a String is incorrectly
   # tagged with the wrong character encoding, that makes it fairly likely
-  # an illegal byte for the specified encoding will be encountered.
+  # an illegal byte for the specified encoding will be encountered.
   #
   # So when reading binary MARC data with the MARC::Reader, it's important
   # that you let it know the expected encoding:
@@ -21,7 +19,7 @@ module MARC
   #
   # If you leave off 'external_encoding', it will use the ruby environment
   # Encoding.default_external, which is usually UTF-8 but may depend on your
-  # environment.
+  # environment.
   #
   # Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
   # bytes. By default MARC::Reader will leave these in the produced Strings,
@@ -29,58 +27,58 @@ module MARC
   # to catch this early, and ask MARC::Reader to raise immediately on illegal
   # bytes:
   #
-  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
   #       :validate_encoding => true)
   #
   # Alternately, you can have MARC::Reader replace illegal bytes
   # with the Unicode Replacement Character, or with a string
   # of your choice (including the empty string, meaning just omit the bad bytes)
   #
-  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
   #        :invalid => :replace)
-  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
+  #     MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
   #        :invalid => :replace, :replace => "")
   #
   # If you supply an :external_encoding argument, MARC::Reader will
   # always assume that encoding -- if you leave it off, MARC::Reader
   # will use the encoding tagged on any input you pass in, such
-  # as Strings or File handles.
+  # as Strings or File handles.
   #
   #     # marc data will have same encoding as string.encoding:
   #     MARC::Reader.decode( string )
   #
   #     # Same, values will have encoding of string.encoding:
-  #     MARC::Reader.new(StringIO.new(string))
+  #     MARC::Reader.new(StringIO.new(string))
   #
   #     # data values will have cp866 encoding, per external_encoding of
   #     # File object passed in
   #     MARC::Reader.new(File.new("myfile.marc", "r:cp866"))
   #
   #     # explicitly tell MARC::Reader the encoding
-  #     MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
+  #     MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
   #
   # === MARC-8
   #
   # The legacy MARC-8 encoding needs to be handled differently, because
-  # there is no built-in support in ruby for MARC-8.
+  # there is no built-in support in ruby for MARC-8.
   #
   # You _can_ specify "MARC-8" as an external encoding. It will trigger
-  # trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
+  # trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
   #
   #     MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
   #
   # For external_encoding "MARC-8", :validate_encoding is always true,
   # there's no way to ignore bad bytes in MARC-8 when transcoding to
-  # unicode.  However, just as with other encodings, the
+  # unicode.  However, just as with other encodings, the
   # `:invalid => :replace` and `:replace => "string"`
-  # options can be used to replace bad bytes instead of raising.
+  # options can be used to replace bad bytes instead of raising.
   #
   # If you want your MARC-8 to be transcoded internally to something
   # other than UTF-8, you can use the :internal_encoding option
-  # which works with any encoding in MARC::Reader.
+  # which works with any encoding in MARC::Reader.
   #
-  #     MARC::Reader.new("marc8.mrc",
-  #       :external_encoding => "MARC-8",
+  #     MARC::Reader.new("marc8.mrc",
+  #       :external_encoding => "MARC-8",
   #       :internal_encoding => "UTF-16LE")
   #
   # If you want to read in MARC-8 without transcoding, leaving the
@@ -90,48 +88,48 @@ module MARC
   #
   #     MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
   #
-  # Please note that MARC::Reader does _not_ currently have any facilities
-  # for guessing encoding from MARC21 leader byte 9, that is ignored.
+  # Please note that MARC::Reader does _not_ currently have any facilities
+  # for guessing encoding from MARC21 leader byte 9, that is ignored.
   #
   # === Complete Encoding Options
   #
   # These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
   # to specify external encoding, ask for a transcode to a different
-  # encoding on read, or validate or replace bad bytes in source.
+  # encoding on read, or validate or replace bad bytes in source.
   #
   # [:external_encoding]
   #    What encoding to consider the MARC record's values to be in. This option
-  #    takes precedence over the File handle or String argument's encodings.
+  #    takes precedence over the File handle or String argument's encodings.
   # [:internal_encoding]
   #    Ask MARC::Reader to transcode to this encoding in memory after reading
-  #    the file in.
+  #    the file in.
   # [:validate_encoding]
   #    If you pass in `true`, MARC::Reader will promise to raise an Encoding::InvalidByteSequenceError
   #    if there are illegal bytes in the source for the :external_encoding. There is
   #    a performance penalty for this check. Without this option, an exception
-  #    _may_ or _may not_ be raised, and whether an exception is raised (or
+  #    _may_ or _may not_ be raised, and whether an exception is raised (or
   #    what class the exception has) may change in future ruby-marc versions
-  #    without warning.
+  #    without warning.
   # [:invalid]
   #    Just like String#encode, set to :replace and any bytes in source data
-  #    illegal for the source encoding will be replaced with the unicode
+  #    illegal for the source encoding will be replaced with the unicode
   #    replacement character (when in unicode encodings), or else '?'. Overrides
   #    :validate_encoding. This can help you sanitize your input and
-  #    avoid ruby "invalid UTF-8 byte" exceptions later.
+  #    avoid ruby "invalid UTF-8 byte" exceptions later.
   # [:replace]
   #    Just like String#encode, combine with `:invalid=>:replace`, set
   #    your own replacement string for invalid bytes. You may use the
-  #    empty string to simply eliminate invalid bytes.
+  #    empty string to simply eliminate invalid bytes.
   #
   # === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
   #
-  # Be careful with using an explicit File object with the File's own
-  # :internal_encoding set -- it can cause ruby to transcode your data
-  # _before_ MARC::Reader gets it, changing the bytecount and making the
+  # Be careful with using an explicit File object with the File's own
+  # :internal_encoding set -- it can cause ruby to transcode your data
+  # _before_ MARC::Reader gets it, changing the bytecount and making the
   # marc record unreadable in some cases. This
   # applies to Encoding.default_internal too!
   #
-  #    # May in some cases result in unreadable marc and an exception
+  #    # May in some cases result in unreadable marc and an exception
   #    MARC::Reader.new(  File.new("marc_in_cp866.mrc", "r:cp866:utf-8") )
   #
   #    # May in some cases result in unreadable marc and an exception
@@ -156,7 +154,7 @@ module MARC
   # https://jira.codehaus.org/browse/JRUBY-6637
   #
   # We recommend using the latest version of jruby, especially
-  # at least jruby 1.7.6.
+  # at least jruby 1.7.6.
   class Reader
     include Enumerable
@@ -182,43 +180,42 @@ module MARC
     #
     # Also, if your data is encoded with a non ascii/utf-8 encoding
     # (for ex. when reading RUSMARC data) and you use ruby 1.9
-    # you can specify source data encoding with an option.
+    # you can specify source data encoding with an option.
     #
     #   reader = MARC::Reader.new('marc.dat', :external_encoding => 'cp866')
     #
     # or, you can pass IO, opened in the corresponding encoding
     #
     #   reader = MARC::Reader.new(File.new('marc.dat', 'r:cp866'))
-    def initialize(file, options = {})
+    def initialize(file, options = {})
       @encoding_options = {}
       # all can be nil
       [:internal_encoding, :external_encoding, :invalid, :replace, :validate_encoding].each do |key|
         @encoding_options[key] = options[key] if options.has_key?(key)
       end
-      if file.is_a?(String)
+      if file.is_a?(String)
         @handle = File.new(file)
-      elsif file.respond_to?("read", 5)
+      elsif file.respond_to?(:read, 5)
         @handle = file
       else
         raise ArgumentError, "must pass in path or file"
       end
-      if (! @encoding_options[:external_encoding] ) && @handle.respond_to?(:external_encoding)
+      if (!@encoding_options[:external_encoding]) && @handle.respond_to?(:external_encoding)
         # use file encoding only if we didn't already have an explicit one,
-        # explicit one takes precedence.
+        # explicit one takes precedence.
         #
         # Note, please don't use ruby's own internal_encoding transcode
         # with binary marc data, the transcode can mess up the byte count
-        # and make it unreadable.
+        # and make it unreadable.
         @encoding_options[:external_encoding] ||= @handle.external_encoding
       end
       # Only pull in the MARC8 translation if we need it, since it's really big
-      if @encoding_options[:external_encoding]  == "MARC-8"
-        require 'marc/marc8/to_unicode' unless defined? MARC::Marc8::ToUnicode
+      if @encoding_options[:external_encoding] == "MARC-8"
+        require "marc/marc8/to_unicode" unless defined? MARC::Marc8::ToUnicode
       end
     end
     # to support iteration:
@@ -226,13 +223,13 @@ module MARC
     #     print record
     #   end
     def each
-      unless block_given?
-        return self.enum_for(:each)
-      else
-        self.each_raw do |raw|
-          record = self.decode(raw)
+      if block_given?
+        each_raw do |raw|
+          record = decode(raw)
           yield record
         end
+      else
+        enum_for(:each)
       end
     end
@@ -257,10 +254,8 @@ module MARC
     #
     # If no block is given, an enumerator is returned
     def each_raw
-      unless block_given?
-        return self.enum_for(:each_raw)
-      else
-        while rec_length_s = @handle.read(5)
+      if block_given?
+        while (rec_length_s = @handle.read(5))
           # make sure the record length looks like an integer
           rec_length_i = rec_length_s.to_i
           if rec_length_i == 0
@@ -269,9 +264,11 @@ module MARC
           # get the raw MARC21 for a record back from the file
           # using the record length
-          raw = rec_length_s + @handle.read(rec_length_i-5)
+          raw = rec_length_s + @handle.read(rec_length_i - 5)
           yield raw
         end
+      else
+        enum_for(:each_raw)
       end
     end
@@ -280,7 +277,7 @@ module MARC
     # Wraps the class method MARC::Reader::decode, using the encoding options of
     # the MARC::Reader instance.
     def decode(marc)
-      return MARC::Reader.decode(marc, @encoding_options)
+      MARC::Reader.decode(marc, @encoding_options)
     end
     # A static method for turning raw MARC data in transmission
@@ -288,34 +285,34 @@ module MARC
     # First argument is a String
     # options include:
     #   [:external_encoding]  encoding of MARC record data values
-    #   [:forgiving]          needs more docs, true is some kind of forgiving
-    #                         of certain kinds of bad MARC.
-    def self.decode(marc, params={})
+    #   [:forgiving]          needs more docs, true is some kind of forgiving
+    #                         of certain kinds of bad MARC.
+    def self.decode(marc, params = {})
       if params.has_key?(:encoding)
-        $stderr.puts "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
+        warn "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
         params[:external_encoding] = params.delete(:encoding)
       end
-      if (! params.has_key? :external_encoding ) && marc.respond_to?(:encoding)
+      if (!params.has_key? :external_encoding) && marc.respond_to?(:encoding)
         # If no forced external_encoding given, respect the encoding
-        # declared on the string passed in.
+        # declared on the string passed in.
         params[:external_encoding] = marc.encoding
       end
       # And now that we've recorded the current encoding, we force
       # to binary encoding, because we're going to be doing byte arithmetic,
-      # and want to avoid byte-vs-char confusion.
+      # and want to avoid byte-vs-char confusion.
       marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
-      record = Record.new()
-      record.leader = marc[0..LEADER_LENGTH-1]
+      record = Record.new
+      record.leader = marc[0..LEADER_LENGTH - 1]
       # where the field data starts
       base_address = record.leader[12..16].to_i
       # get the byte offsets from the record directory
-      directory = marc[LEADER_LENGTH..base_address-1]
+      directory = marc[LEADER_LENGTH..base_address - 1]
-      raise MARC::Exception.new("invalid directory in record") if directory == nil
+      raise MARC::Exception.new("invalid directory in record") if directory.nil?
       # the number of fields in the record corresponds to
       # how many directory entries there are
@@ -324,20 +321,19 @@ module MARC
       # when operating in forgiving mode we just split on end of
       # field instead of using calculated byte offsets from the
       # directory
-      if params[:forgiving]
+      if params[:forgiving]
         marc_field_data = marc[base_address..-1]
         # It won't let us do the split on bad utf8 data, but
         # we haven't yet set the 'proper' encoding or used
         # our correction/replace options. So call it binary for now.
         marc_field_data.force_encoding("binary") if marc_field_data.respond_to?(:force_encoding)
         all_fields = marc_field_data.split(END_OF_FIELD)
       else
-        mba =  marc.bytes.to_a
+        mba = marc.bytes.to_a
       end
-      0.upto(num_fields-1) do |field_num|
+      0.upto(num_fields - 1) do |field_num|
         # pull the directory entry for a field out
         entry_start = field_num * DIRECTORY_ENTRY_LENGTH
         entry_end = entry_start + DIRECTORY_ENTRY_LENGTH
@@ -350,12 +346,12 @@ module MARC
         # if we were told to be forgiving we just use the
         # next available chunk of field data that we
         # split apart based on the END_OF_FIELD
-        field_data = ''
+        field_data = ""
         if params[:forgiving]
-          field_data = all_fields.shift()
+          field_data = all_fields.shift
-        # otherwise we actually use the byte offsets in
-        # directory to figure out what field data to extract
+          # otherwise we actually use the byte offsets in
+          # directory to figure out what field data to extract
         else
           length = entry[3..6].to_i
           offset = entry[7..11].to_i
@@ -366,11 +362,11 @@ module MARC
         # remove end of field
         field_data.delete!(END_OF_FIELD)
         # add a control field or data field
         if MARC::ControlField.control_tag?(tag)
-          field_data = MARC::Reader.set_encoding( field_data , params)
-          record.append(MARC::ControlField.new(tag,field_data))
+          field_data = MARC::Reader.set_encoding(field_data, params)
+          record.append(MARC::ControlField.new(tag, field_data))
         else
           field = MARC::DataField.new(tag)
@@ -379,17 +375,17 @@ module MARC
           # must have at least 2 elements (indicators, and 1 subfield)
           # TODO some sort of logging?
-          next if subfields.length() < 2
+          next if subfields.length < 2
           # get indicators
-          indicators = MARC::Reader.set_encoding( subfields.shift(), params)
-          field.indicator1 = indicators[0,1]
-          field.indicator2 = indicators[1,1]
+          indicators = MARC::Reader.set_encoding(subfields.shift, params)
+          field.indicator1 = indicators[0, 1]
+          field.indicator2 = indicators[1, 1]
           # add each subfield to the field
-          subfields.each() do |data|
-            data = MARC::Reader.set_encoding( data, params )
-            subfield = MARC::Subfield.new(data[0,1],data[1..-1])
+          subfields.each do |data|
+            data = MARC::Reader.set_encoding(data, params)
+            subfield = MARC::Subfield.new(data[0, 1], data[1..-1])
             field.append(subfield)
           end
@@ -398,10 +394,12 @@ module MARC
         end
       end
-      return record
-    end
+      raise MARC::RecordException, record unless record.valid?
-    # input passed in probably has 'binary' encoding.
+      record
+    end
+    # input passed in probably has 'binary' encoding.
     # We'll set it to the proper encoding, and depending on settings, optionally
     # * check for valid encoding
     #   * raise if not valid
@@ -411,16 +409,16 @@ module MARC
     # Special case for encoding "MARC-8" -- will be transcoded to
     # UTF-8 (then further transcoded to external_encoding, if set).
     # For "MARC-8", validate_encoding is always true, there's no way to
-    # ignore bad bytes.
+    # ignore bad bytes.
     #
     # Params options:
-    #
-    #  * external_encoding: what encoding the input is expected to be in
+    #
+    #  * external_encoding: what encoding the input is expected to be in
     #  * validate_encoding: if true, will raise if an invalid encoding
     #  * invalid:  if set to :replace, will replace bad bytes with replacement
-    #              chars instead of raising.
+    #              chars instead of raising.
     #  * replace: Set replacement char for use with 'invalid', otherwise defaults
-    #             to unicode replacement char, or question mark.
+    #             to unicode replacement char, or question mark.
     def self.set_encoding(str, params)
       if str.respond_to?(:force_encoding)
         if params[:external_encoding]
@@ -430,41 +428,38 @@ module MARC
           else
             str = str.force_encoding(params[:external_encoding])
           end
-        end
+        end
         # If we're transcoding anyway, pass our invalid/replace options
         # on to String#encode, which will take care of them -- or raise
-        # with illegal bytes without :invalid=>:replace.
+        # with illegal bytes without :invalid=>:replace.
         #
         # If we're NOT transcoding, we need to use our own pure-ruby
         # implementation to do invalid byte replacements. OR to raise
         # a predictable exception iff :validate_encoding, otherwise
         # for performance we won't check, and you may or may not
         # get an exception from inside ruby-marc, and it may change
-        # in future implementations.
+        # in future implementations.
         if params[:internal_encoding]
-          if RUBY_VERSION >= '3.0'
-            str = str.encode(params[:internal_encoding], **params)
+          str = if RUBY_VERSION >= "3.0"
+            str.encode(params[:internal_encoding], **params)
           else
-            str = str.encode(params[:internal_encoding], params)
+            str.encode(params[:internal_encoding], params)
           end
-        elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
+        elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
-          if params[:validate_encoding] == true && ! str.valid_encoding?
-            raise  Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
+          if params[:validate_encoding] == true && !str.valid_encoding?
+            raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
           end
           if params[:invalid] == :replace
             str = str.scrub(params[:replace])
           end
-         end
-       end
-       return str
-    end
-  end
+        end
+      end
+      str
+    end
+  end
   # Like Reader ForgivingReader lets you read in a batch of MARC21 records
   # but it does not use record lengths and field byte offsets found in the
@@ -479,22 +474,19 @@ module MARC
   #
   # **NOTE**: ForgivingReader _may_ have unpredictable results when used
   # with marc records with char encoding other than system default (usually
-  # UTF8), _especially_ if you have Encoding.default_internal set.
+  # UTF8), _especially_ if you have Encoding.default_internal set.
   #
   # Implemented as a sub-class of Reader over-riding #each, so we still
   # get DRY Reader's #initialize with proper char encoding options
-  # and handling.
+  # and handling.
   class ForgivingReader < Reader
     def each
       @handle.each_line(END_OF_RECORD) do |raw|
-        begin
-          record = MARC::Reader.decode(raw, @encoding_options.merge(:forgiving => true))
-          yield record
-        rescue StandardError => e
-          # caught exception just keep barrelling along
-          # TODO add logging
-        end
+        record = MARC::Reader.decode(raw, @encoding_options.merge(forgiving: true))
+        yield record
+      rescue
+        # caught exception just keep barrelling along
+        # TODO add logging
       end
     end
   end