RubyGems - marc - Versions diffs - 1.0.0 → 1.1.1 - Mend

marc 1.0.0 → 1.1.1

Files changed (15) hide show

checksums.yaml +5 -5
data/Changes +33 -18
data/README.md +13 -5
data/lib/marc/datafield.rb +1 -1
data/lib/marc/marc8/to_unicode.rb +21 -21
data/lib/marc/reader.rb +7 -3
data/lib/marc/record.rb +69 -53
data/lib/marc/version.rb +1 -1
data/lib/marc/xml_parsers.rb +9 -1
data/lib/marc/xmlwriter.rb +2 -1
data/test/marc8/tc_to_unicode.rb +33 -10
data/test/tc_xml.rb +7 -2
data/test/tc_xml_error_handling.rb +22 -0
data/test/three-records-second-bad.xml +160 -0
metadata +43 -29

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 91f847e4c58a60a123919914206e8b41935d1268
-  data.tar.gz: b9485bfa59528e3703cc5bc4b45ec10df6dea502
+SHA256:
+  metadata.gz: 53c1e338a00e1dbd7c09ed14edc916edd1211abe0edbabcf757b8b00a5aa209c
+  data.tar.gz: 80b4c48c2fc95887216194d264583302bacae6c606616a087c888668ba2bfb68
 SHA512:
-  metadata.gz: 7434cec1dd28cd9bdbf0390958d5f663f187720597c8bc5a0a45d04d6a84409bada2c6974e94b48effdc2e33fbb48fe54b3b86fb050a7e2bfa17da49bada3831
-  data.tar.gz: 83d5be3ce3eaa6da114101fe4dcbe15320b394184aeafa9aa20b8f7254c25d01f42f62d7153546909310adcb44d7f809c95f467c66f123435705b587957901b2
+  metadata.gz: b6dd17fa76ff33ef0da68946d29ec2079c495423559c47391cabf80b59393f0d481ef6b1467b839465d3225c1a52aec2d4945994986e8e956eccfc2ea73ada5b
+  data.tar.gz: a745e41aa2cbe87c70f9a2cbfe0de0dd12f47c167f867127353d86ff771ada14327dbce1ec461fcb81cacb3cf3f60f821cdb4c0087213f37a0ef42dc7ade78e8

data/Changes CHANGED Viewed

@@ -1,15 +1,31 @@
+v1.1.1 June 2021
+- Fix a regression when normalizing indicator values when serializing marcxml
+v1.1.0 June 2021
+ - Add support for additional valid subfield codes in marcxml
+v1.0.2 July 2017
+ - Now (correctly) throw an error if datafield string is the empty string
+   (thanks to @bibliotechy)
+v1.0.1 February 2016
+- Non-user-facing change in implementation of FieldMap strictly for performance
+v1.0.0 January 2015
+- Mostly changes that deal with encoding, plus the plunge to a 1.0 release
 v0.5.0 April 2012
-- Extensive rewrite of MARC::Reader (ISO 2709 binary reader) to provide a
-  fairly complete and consistent handling of char encoding issues in ruby 1.9.
+- Extensive rewrite of MARC::Reader (ISO 2709 binary reader) to provide a
+  fairly complete and consistent handling of char encoding issues in ruby 1.9.
   - This code is well covered by automated tests, but ends up complex, there
-    may be bugs, please report them.
-  - May not work properly under jruby with non-unicode source encodings.
-  - Still can't handle Marc8 encoding.
+    may be bugs, please report them.
+  - May not work properly under jruby with non-unicode source encodings.
+  - Still can't handle Marc8 encoding.
   - May not have entirely backwards compatible behavior with regard to char
-    encodings under ruby 1.9.x as previous 0.4.x versions. Test your code.
+    encodings under ruby 1.9.x as previous 0.4.x versions. Test your code.
     In particular, previous versions may have automatically _transcoded_
     non-unicode encodings to UTF-8 for you. This version will not do
-    so unless you ask it to with correct arguments.
+    so unless you ask it to with correct arguments.
 v0.4.4 Sat Mar 03 14:55:00 EDT 2012
 - Fixed performance regression: strict reader will parse about 5x faster now
@@ -26,8 +42,8 @@ v0.2.1 Mon Aug 18 14:14:16 EDT 2008
   Ross Singer)
 v0.2.0 Wed Jun 11 12:42:20 EDT 2008
-- added newline to output generated by REXML::Formatters::Default to make
-  it a bit more friendly. REXML::Formatters::Pretty and Transitive just
+- added newline to output generated by REXML::Formatters::Default to make
+  it a bit more friendly. REXML::Formatters::Pretty and Transitive just
   don't do what I want (whitespace in weird places).
 v0.1.9 Thu Jun  5 12:00:01 EDT 2008
@@ -36,7 +52,7 @@ v0.1.9 Thu Jun  5 12:00:01 EDT 2008
 v0.1.8 Tue Nov 13 22:51:03 EST 2007
 - added examples directory
-- fixed problem with leading whitespace and the leader in xml reader
+- fixed problem with leading whitespace and the leader in xml reader
   (thanks Morgan Cundiff)
 v0.1.7 Mon Nov 12 09:33:57 EST 2007
@@ -58,7 +74,7 @@ v0.1.4 Tue Jan  2 15:45:53 EST 2007
 - fixed bug in MARC::XMLWriter that was outputting all control field tags as 00z
   (thanks Ross Singer)
 - added :include_namespace option to MARC::XMLWriter::encode to include the
-  marcxml namespace, which allows MARC::Record::to_xml to emit the namespace
+  marcxml namespace, which allows MARC::Record::to_xml to emit the namespace
   for a single record.
 v0.1.3  Tue Jan  2 12:56:36 EST 2007
@@ -67,11 +83,11 @@ v0.1.3  Tue Jan  2 12:56:36 EST 2007
   as the hash keys.
 v0.1.2  Thu Dec 21 18:46:01 EST 2007
-- fixed MARC::Record::to_xml so that it actually is tested and works (thanks
+- fixed MARC::Record::to_xml so that it actually is tested and works (thanks
   Ross Singer)
 v0.1.1
-- added ability to pass File like objects to the constructor for
+- added ability to pass File like objects to the constructor for
   MARC::XMLReader like MARC::Reader (thanks Jake Glenn)
 v0.1.0  Wed Dec  6 15:40:40 EST 2006
@@ -93,11 +109,11 @@ v0.0.9  Tue Mar 28 10:02:16 CST 2006
 - added :stylesheet argument to XMLWriter.new
 v0.0.8  Mon Jan 16 22:31:00 EST 2006
-- removed control tests out of tc_field.rb into tc_control.rb
+- removed control tests out of tc_field.rb into tc_control.rb
 - fixed some formatting
 - changed control/field to controlfield/datafield
 - added == check for controlfield
-- removed namespace declarations on record elements in favor of default
+- removed namespace declarations on record elements in favor of default
   namespace on collection element
 - added spaces around subfield code and delimiter in to_s
 - fixed up relevant tests that were expecting old formatting
@@ -106,8 +122,8 @@ v0.0.8  Mon Jan 16 22:31:00 EST 2006
 v0.0.7  Mon Jan  2 21:39:28 CST 2006
 - MARC::XMLWriter added
-- removed encode/decode methods in MARC::MARC21 into MARC::Writer and
-  MARC::Reader respectively. This required pushing MARC21 specific constants
+- removed encode/decode methods in MARC::MARC21 into MARC::Writer and
+  MARC::Reader respectively. This required pushing MARC21 specific constants
   out into MARC::Constants which is required as necessary.
 - moved encode from MARC::MARXML into MARC::XMLWriter and added constants
   to MARC::Constants
@@ -137,4 +153,3 @@ v0.0.2  Mon Oct 17 17:42:57 CDT 2005
 v0.0.1  Mon Oct 10 10:29:20 CDT 2005
 - initial release

data/README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 [![Gem Version](https://badge.fury.io/rb/marc.png)](http://badge.fury.io/rb/marc)
-[![Build Status](https://secure.travis-ci.org/ruby-marc/ruby-marc.png)](http://travis-ci.org/ruby-marc/ruby-marc)
+![Build Status](https://github.com/ruby-marc/ruby-marc/workflows/CI/badge.svg) |
 marc is a ruby library for reading and writing MAchine Readable Cataloging
 (MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
@@ -34,7 +34,7 @@ marc is a ruby library for reading and writing MAchine Readable Cataloging
 MARC::Record provides `#to_hash` and `#from_hash` implementations that deal in ruby
 hashes that are compatible with the
-[marc-in-json](http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
+[marc-in-json](https://rossfsinger.com/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
 serialization format. You are responsible for serializing the hash to/from JSON yourself.
 ## Installation
@@ -56,7 +56,17 @@ Consult the MARC::Reader class docs for a more complete discussion and range of
 The MARC binary Writer (MARC::Writer) does not have any encoding-related features -- it's up to you the developer to make sure you create MARC::Records with consistent and expected char encodings, although MARC::Writer will write out a legal ISO 2709 either way, it just might have corrupted encodings.
+When parsing MARCXML _with Nokogiri as your XML parser implementation_ up to
+and including version `1.0.2` of this gem, if the XML was badly formed, parsing
+would stop and no error would be reported to your code.
+If you are using a version > `1.0.2` of `ruby-marc` with MRI + Nokogiri, XML
+syntax errors will be thrown (and you may need to adjust your code to account
+for this).  *JRuby users*: If you are using a version later than `1.0.2` and
+using Nokogiri as an XML parser with JRuby as your ruby implementation, XML
+syntax errors will still be ignored unless you have Nokogiri version `1.10.2`
+or later.
 ## Miscellany
 Source code at: https://github.com/ruby-marc/ruby-marc/
@@ -69,8 +79,6 @@ Developers, release new version of gem to rubygems with `rake release`
 (bundler-supplied task). Note that one nice thing this will do is automatically
 tag the version in git, very important for later figuring out what's going on.
-Please send bugs, requests and comments to Code4Lib Mailing list (https://listserv.nd.edu/cgi-bin/wa?A0=CODE4LIB).
 ## Authors
 Kevin Clarke <ksclarke@gmail.com>

data/lib/marc/datafield.rb CHANGED Viewed

@@ -50,7 +50,7 @@ module MARC
     def initialize(tag, i1=' ', i2=' ', *subfields)
       # if the tag is less than 3 characters long and
       # the string is all numeric then we pad with zeros
-      if tag.length < 3 and /^[0-9]*$/ =~ tag
+      if tag.length < 3 and /^[0-9]+$/ =~ tag
         @tag = "%03d" % tag
       else
         @tag = tag

data/lib/marc/marc8/to_unicode.rb CHANGED Viewed

@@ -12,12 +12,12 @@ module MARC
     # http://www.loc.gov/marc/specifications/speccharmarc8.html
     #
     # NOT thread-safe, it needs to keep state as it goes through a string,
-    # do not re-use between threads.
+    # do not re-use between threads.
     #
-    # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
+    # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
     #
     # Returns UTF-8 encoded string! Encode to something else if you want
-    # something else.
+    # something else.
     #
     # III proprietary code points?
     class ToUnicode
@@ -31,7 +31,7 @@ module MARC
       # These are state flags, MARC8 requires you to keep
       # track of 'current char sets' or something like that, which
-      # are changed with escape codes, or something like that.
+      # are changed with escape codes, or something like that.
       attr_accessor :g0, :g1
       def initialize
@@ -39,21 +39,21 @@ module MARC
         self.g1 = ANSEL
       end
-      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
+      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
       #
       # Bad Marc8 bytes?  By default will raise an Encoding::InvalidByteSequenceError
       # (will not have full metadata filled out, but will have a decent error message)
       #
       # Set option :invalid => :replace to instead silently replace bad bytes
-      # with a replacement char -- by default Unicode Replacement Char, but can set
-      # option :replace to something else, including empty string.
+      # with a replacement char -- by default Unicode Replacement Char, but can set
+      # option :replace to something else, including empty string.
       #
       # converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
       #
       # By default returns NFC normalized, but set :normalization option to:
       #    :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
       #    we won't do any normalization just take it as it comes out of the
-      #    transcode algorithm. This will generally NOT be composed.
+      #    transcode algorithm. This will generally NOT be composed.
       #
       # By default, escaped unicode 'numeric character references' in Marc8 will
       # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
@@ -61,21 +61,21 @@ module MARC
       #
       # String arg passed in WILL have its encoding tagged 'binary' if
       # it's not already, if it's Marc8 there's no good reason for it not to
-      # be already.
+      # be already.
       def transcode(marc8_string, options = {})
         invalid_replacement     = options.fetch(:replace, "\uFFFD")
         expand_ncr              = options.fetch(:expand_ncr, true)
         normalization           = options.fetch(:normalization, :nfc)
         # don't choke on empty marc8_string
         return "" if marc8_string.nil? || marc8_string.empty?
         # Make sure to call it 'binary', so we can slice it
         # byte by byte, and so ruby doesn't complain about bad
         # bytes for some other encoding. Yeah, we're changing
         # encoding on input! If it's Marc8, it ought to be tagged
-        # binary already.
+        # binary already.
         marc8_string.force_encoding("binary")
         uni_list = []
@@ -124,7 +124,7 @@ module MARC
             end
             mb_flag = is_multibyte(self.g0)
             if mb_flag
                 code_point = (marc8_string[pos].ord * 65536 +
                      marc8_string[pos+1].ord * 256 +
@@ -134,7 +134,7 @@ module MARC
                 code_point = marc8_string[pos].ord
                 pos += 1
             end
             if (code_point < 0x20 or
                 (code_point > 0x80 and code_point < 0xa0))
                 uni = unichr(code_point)
@@ -144,7 +144,7 @@ module MARC
             begin
               code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
               (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
               if cflag
                   combinings.push unichr(uni)
               else
@@ -160,16 +160,16 @@ module MARC
                 uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
                 pos += 1
               else
-                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
+                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
               end
             end
         end
         # what to do if combining chars left over?
         uni_str = uni_list.join('')
         if expand_ncr
-          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
+          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
             [$1.hex].pack("U")
           end
         end
@@ -177,7 +177,7 @@ module MARC
         if normalization
           uni_str = UNF::Normalizer.normalize(uni_str, normalization)
         end
         return uni_str
       end
@@ -188,11 +188,11 @@ module MARC
       end
       # input single unicode codepoint as integer; output encoded as a UTF-8 string
-      # python has unichr built-in, we just define it for convenience no problem.
+      # python has unichr built-in, we just define it for convenience no problem.
       def unichr(code_point)
         [code_point].pack("U")
       end
     end
   end
-end
+end

data/lib/marc/reader.rb CHANGED Viewed

@@ -138,10 +138,10 @@ module MARC
   #    Encoding.default_internal = "utf-8"
   #    MARC::Reader.new(  File.new("marc_in_cp866.mrc", "r:cp866") )
   #
-  #    # However this shoudl be safe:
+  #    # However this should be safe:
   #    MARC::Reader.new(  "marc_in_cp866.mrc", :external_encoding => "cp866")
   #
-  #    # And this shoudl be safe, if you do want to transcode:
+  #    # And this should be safe, if you do want to transcode:
   #    MARC::Reader.new(  "marc_in_cp866.mrc", :external_encoding => "cp866",
   #       :internal_encoding => "utf-8")
   #
@@ -443,7 +443,11 @@ module MARC
         # get an exception from inside ruby-marc, and it may change
         # in future implementations.
         if params[:internal_encoding]
-          str = str.encode(params[:internal_encoding], params)
+          if RUBY_VERSION >= '3.0'
+            str = str.encode(params[:internal_encoding], **params)
+          else
+            str = str.encode(params[:internal_encoding], params)
+          end
         elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
           if params[:validate_encoding] == true && ! str.valid_encoding?

data/lib/marc/record.rb CHANGED Viewed

@@ -1,16 +1,17 @@
-module MARC
+module MARC
   # The FieldMap is an Array of DataFields and Controlfields.
-  # It also contains a Hash representation
+  # It also contains a Hash representation
   # of the fields for faster lookups (under certain conditions)
   class FieldMap < Array
     attr_reader :tags
     attr_accessor :clean
     def initialize
-      @tags = {}
+      @tags  = {}
       @clean = true
     end
     # Rebuild the HashWithChecksumAttribute with the current
     # values of the fields Array
     def reindex
@@ -21,28 +22,42 @@ module MARC
       end
       @clean = true
     end
     # Returns an array of all of the tags that appear in the record (not in the order they appear, however).
     def tag_list
       reindex unless @clean
       @tags.keys
     end
     # Returns an array of fields, in the order they appear, according to their tag.
     # The tags argument can be a string (e.g. '245'), an array (['100','700','800'])
     # or a range (('600'..'699')).
     def each_by_tag(tags)
       reindex unless @clean
-      indices = @tags.values_at(*(@tags.keys & [*tags])).flatten.sort
+      indices = []
+      # Get all the indices associated with the tags
+      Array(tags).each do |t|
+        indices.concat @tags[t] if @tags[t]
+      end
+      # Remove any nils
+      indices.compact!
       return [] if indices.empty?
-      self.values_at(*indices).each do |tag|
-        yield tag
+     # Sort it, so we get the fields back in the order they appear in the record
+      indices.sort!
+      indices.each do |tag|
+        yield self[tag]
       end
     end
-    # Freeze for immutability, first reindexing if needed.
+    # Freeze for immutability, first reindexing if needed.
     # A frozen FieldMap is safe for concurrent access, and also
-    # can more easily avoid accidental reindexing on even read-only use.
+    # can more easily avoid accidental reindexing on even read-only use.
     def freeze
       self.reindex unless @clean
       super
@@ -50,18 +65,18 @@ module MARC
   end
   # A class that represents an individual MARC record. Every record
-  # is made up of a collection of MARC::DataField objects.
+  # is made up of a collection of MARC::DataField objects.
   #
   # MARC::Record mixes in Enumerable to enable access to constituent
   # DataFields. For example, to return a list of all subject DataFields:
   #
-  #   record.find_all {|field| field.tag =~ /^6../}
-  #
+  #   record.find_all {|field| field.tag =~ /^6../}
+  #
   # The accessor 'fields' is also an Array of MARC::DataField objects which
   # the client can modify if necessary.
   #
   #   record.fields.delete(field)
-  #
+  #
   # Other accessor attribute: 'leader' for record leader as String
   #
   # == High-performance lookup by tag
@@ -82,13 +97,13 @@ module MARC
   #
   # MARC::Record is not generally safe for sharing between threads.
   # Even if you think you are just accessing it read-only,
-  # you may accidentally trigger a reindex of the by-tag cache (see above).
+  # you may accidentally trigger a reindex of the by-tag cache (see above).
   #
   # However, after you are done constructing a Record, you can mark
   # the `fields` array as immutable. This makes a Record safe for sharing
   # between threads for read-only use, and also helps you avoid accidentally
   # triggering a reindex, as accidental reindexes can harm by-tag
-  # lookup performance.
+  # lookup performance.
   #
   #     record.fields.freeze
   class Record
@@ -101,9 +116,9 @@ module MARC
     attr_accessor :leader
     def initialize
-      @fields = FieldMap.new
+      @fields         = FieldMap.new
       # leader is 24 bytes
-      @leader = ' ' * 24
+      @leader         = ' ' * 24
       # leader defaults:
       # http://www.loc.gov/marc/bibliographic/ecbdldrd.html
       @leader[10..11] = '22'
@@ -119,9 +134,9 @@ module MARC
     end
     # alias to append
     def <<(field)
-      append(field)
+      append(field)
     end
     # each() is here to support iterating and searching since MARC::Record
@@ -141,20 +156,20 @@ module MARC
         yield field
       end
     end
-    # A more convenient way to iterate over each field with a given tag.
+    # A more convenient way to iterate over each field with a given tag.
     # The filter argument can be a string, array or range.
     def each_by_tag(filter)
-      @fields.each_by_tag(filter) {|tag| yield tag }
+      @fields.each_by_tag(filter) { |tag| yield tag }
     end
     # You can lookup fields using this shorthand:
     #   title = record['245']
     def [](tag)
-      return self.find {|f| f.tag == tag}
+      return self.find { |f| f.tag == tag }
     end
     # Provides a backwards compatible means to access the FieldMap.
     # No argument returns the FieldMap array in entirety.  Providing
     # a string, array or range of tags will return an array of fields
@@ -163,9 +178,9 @@ module MARC
       unless filter
         # Since we're returning the FieldMap object, which the caller
         # may mutate, we precautionarily mark dirty -- unless it's frozen
-        # immutable.
+        # immutable.
         @fields.clean = false unless @fields.frozen?
-        return @fields
+        return @fields
       end
       @fields.reindex unless @fields.clean
       flds = []
@@ -180,18 +195,18 @@ module MARC
       end
       flds
     end
     # Returns an array of all of the tags that appear in the record (not necessarily in the order they appear).
     def tags
       return @fields.tag_list
     end
-    # Factory method for creating a MARC::Record from MARC21 in
+    # Factory method for creating a MARC::Record from MARC21 in
     # transmission format.
     #
     #   record = MARC::Record.new_from_marc(marc21)
     #
-    # in cases where you might be working with somewhat flawed
+    # in cases where you might be working with somewhat flawed
     # MARC data you may want to use the :forgiving parameter which
     # will bypass using field byte offsets and simply look for the
     # end of field byte to figure out the end of fields.
@@ -203,12 +218,12 @@ module MARC
     end
-    # Returns a record in MARC21 transmission format (ANSI Z39.2).
+    # Returns a record in MARC21 transmission format (ANSI Z39.2).
     # Really this is just a wrapper around MARC::Writer.encode
     #
     #   marc = record.to_marc()
-    def to_marc
+    def to_marc
       return MARC::Writer.encode(self)
     end
@@ -235,51 +250,51 @@ module MARC
     # Return a marc-hash version of the record
     def to_marchash
       return {
-        'type' => 'marc-hash',
-        'version' => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION],
-        'leader' => self.leader,
-        'fields' => self.map {|f| f.to_marchash}
+          'type'    => 'marc-hash',
+          'version' => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION],
+          'leader'  => self.leader,
+          'fields'  => self.map { |f| f.to_marchash }
       }
-    end #to_hash
+    end
+    #to_hash
     # Factory method for creating a new MARC::Record from
     # a marchash object
     #
     # record = MARC::Record.new_from_marchash(mh)
     def self.new_from_marchash(mh)
-      r = self.new()
+      r        = self.new()
       r.leader = mh['leader']
       mh['fields'].each do |f|
-        if (f.length == 2)
+        if (f.length == 2)
           r << MARC::ControlField.new(f[0], f[1])
-        elsif
-          r << MARC::DataField.new(f[0], f[1], f[2], *f[3])
+        elsif r << MARC::DataField.new(f[0], f[1], f[2], *f[3])
         end
       end
       return r
     end
     # Returns a (roundtrippable) hash representation for MARC-in-JSON
     def to_hash
-      record_hash = {'leader'=>@leader, 'fields'=>[]}
+      record_hash = {'leader' => @leader, 'fields' => []}
       @fields.each do |field|
         record_hash['fields'] << field.to_hash
       end
       record_hash
-    end
+    end
     def self.new_from_hash(h)
-      r = self.new
+      r        = self.new
       r.leader = h['leader']
       if h['fields']
         h['fields'].each do |position|
           position.each_pair do |tag, field|
             if field.is_a?(Hash)
               f = MARC::DataField.new(tag, field['ind1'], field['ind2'])
-              field['subfields'].each do | pos |
+              field['subfields'].each do |pos|
                 pos.each_pair do |code, value|
                   f.append MARC::Subfield.new(code, value)
                 end
@@ -290,9 +305,10 @@ module MARC
             end
           end
         end
-      end
-      return r
+      end
+      return r
     end
     # Returns a string version of the record, suitable for printing
     def to_s
@@ -315,7 +331,7 @@ module MARC
     #   if record =~ /Gravity's Rainbow/ then print "Slothrop" end
     def =~(regex)
-      return self.to_s =~ regex
+      return self.to_s =~ regex
     end
   end

data/lib/marc/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module MARC
-  VERSION = "1.0.0"
+  VERSION = "1.1.1"
 end

data/lib/marc/xml_parsers.rb CHANGED Viewed

@@ -1,4 +1,8 @@
 module MARC
+  # Exception class to be thrown when an XML parser
+  # encounters an unrecoverable error.
+  class XMLParseError < StandardError; end
   # The MagicReader will try to use the best available XML Parser at the
   # time of initialization.
   # The order is currently:
@@ -112,6 +116,10 @@ module MARC
         @parser.parse(@handle)
       end
     end
+    def error(evt)
+      raise(XMLParseError, "XML parsing error: #{evt}")
+    end
     def method_missing(methName, *args)
@@ -413,4 +421,4 @@ end
       end
     end # end of module
   end # end of if jruby
-end
+end

data/lib/marc/xmlwriter.rb CHANGED Viewed

@@ -61,6 +61,7 @@ module MARC
     def self.encode(record, opts={})
       singleChar = Regexp.new('[\da-z ]{1}')
+      subfieldChar = Regexp.new('[\dA-Za-z!"#$%&\'()*+,-./:;<=>?{}_^`~\[\]\\\]{1}')
       ctrlFieldTag = Regexp.new('00[1-9A-Za-z]{1}')
       # Right now, this writer handles input from the strict and
@@ -122,7 +123,7 @@ module MARC
             # If marc is leniently parsed, we may have some dirty data; using
             # the blank subfield code should help us locate these later to fix
-            if (subfield.code.match(singleChar) == nil)
+            if (subfield.code.match(subfieldChar) == nil)
               subfield.code = ' '
             end

data/test/marc8/tc_to_unicode.rb CHANGED Viewed

@@ -32,9 +32,9 @@ if "".respond_to?(:encoding)
     def test_lots_of_marc8_test_cases
       # Heap of test cases taken from pymarc, which provided these
-      # two data files, marc8 and utf8, with line-by-line correspondences.
+      # two data files, marc8 and utf8, with line-by-line correspondences.
       #
-      # For now, we have NOT included proprietary III encodings in our test data!
+      # For now, we have NOT included proprietary III encodings in our test data!
       utf8_file   = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
       marc8_file  = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
@@ -55,7 +55,7 @@ if "".respond_to?(:encoding)
           assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
         end
-      rescue EOFError => each
+      rescue EOFError => each
         # just means the file was over, no biggie
         assert i > 1500, "Read as many lines as we expected to, at least 1500"
       rescue Exception => e
@@ -82,27 +82,50 @@ if "".respond_to?(:encoding)
       assert_equal unicode_d,  converter.transcode(marc8, :normalization => :nfd)
       assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
-      # disable normalization for performance or something, we won't end up with NFC.
+      # disable normalization for performance or something, we won't end up with NFC.
       refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
     end
     def test_expand_ncr
       converter = MARC::Marc8::ToUnicode.new
       marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
       assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
       assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
-    end
+    end
     def test_bad_byte
       converter = MARC::Marc8::ToUnicode.new
       bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
       assert_raise(Encoding::InvalidByteSequenceError) {
-        value = converter.transcode(bad_marc8)
+        converter.transcode(bad_marc8)
       }
     end
+    def test_bad_byte_error_message
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      begin
+        converter.transcode(bad_marc8)
+      rescue Encoding::InvalidByteSequenceError => err
+        assert_equal("MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�", err.message)
+      end
+    end
+    def test_multiple_bad_byte_error_message
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      begin
+        converter.transcode(bad_marc8)
+      rescue Encoding::InvalidByteSequenceError => err
+        # It still identifies the first bad byte found in the offset info, but replaces all bad bytes in the error message
+        assert_equal("MARC8, input byte offset 21, code set: 0x31, code point: 0x7b3639, value: 統治の仕組� 米国の統治の仕組� 米国の統治の仕組�", err.message)
+      end
+    end
     def test_bad_byte_with_replacement
       converter = MARC::Marc8::ToUnicode.new
@@ -112,9 +135,9 @@ if "".respond_to?(:encoding)
       assert_equal "UTF-8", value.encoding.name
       assert value.valid_encoding?
-      assert value.include?("\uFFFD"), "includes replacement char"
+      assert value.include?("\uFFFD"), "includes replacement char"
       # coalescing multiple replacement chars at end, could change
-      # to not do so, important thing is at least one is there.
+      # to not do so, important thing is at least one is there.
       assert_equal "米国の統治の仕組�", value
     end
@@ -150,5 +173,5 @@ if "".respond_to?(:encoding)
   end
 else
   require 'pathname'
-  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
+  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
 end

data/test/tc_xml.rb CHANGED Viewed

@@ -140,11 +140,16 @@ class XMLTest < Test::Unit::TestCase
     record1 = MARC::Record.new
     record1.leader =  '00925njm  22002777a 4500'
     record1.append MARC::ControlField.new('007', 'sdubumennmplu')
-    record1.append MARC::DataField.new('245', '0', '4',
+    record1.append MARC::DataField.new('245', '0', '4',
       ['a', 'The Great Ray Charles'], ['h', '[sound recording].'])
+    record1.append MARC::DataField.new('998', ' ', ' ',
+      ['^', 'Valid local subfield'])
+    # MARC::XMLWriter mutates records
+    dup_record = MARC::Record.new_from_hash(record1.to_hash)
     writer = MARC::XMLWriter.new('test/test.xml', :stylesheet => 'style.xsl')
-    writer.write(record1)
+    writer.write(dup_record)
     writer.close
     xml = File.read('test/test.xml')

data/test/tc_xml_error_handling.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'test/unit'
+require 'marc'
+class BadXMLHandlingTestCase < Test::Unit::TestCase
+  def test_nokogiri_bad_xml
+    begin
+      require 'nokogiri'
+    rescue LoadError
+      omit("nokogiri not installed, cannot test")
+    end
+    omit("nokogiri (<1.10.2) under jruby doesn't support error handling: sparklemotion/nokogiri#1847") if RUBY_PLATFORM == 'java' && Gem::Version.new(Nokogiri::VERSION) < Gem::Version.new('1.10.2')
+    count = 0
+    reader = MARC::XMLReader.new('test/three-records-second-bad.xml', :parser => :nokogiri)
+    assert_raise MARC::XMLParseError do
+      reader.each do |rec|
+        count += 1 if rec['260']
+      end
+    end
+    assert_equal(1, count, 'should only be able to parse one record')
+  end
+end

data/test/three-records-second-bad.xml ADDED Viewed

@@ -0,0 +1,160 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/MARC21/slim" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
+    <record>
+        <leader>     njm a22     uu 4500</leader>
+        <controlfield tag="001">afc99990058366</controlfield>
+        <controlfield tag="003">DLC</controlfield>
+        <controlfield tag="005">20071104155141.9</controlfield>
+        <controlfield tag="007">sd ummunniauub</controlfield>
+        <controlfield tag="008">071103s1939    xxufmnne||||||||| u eng||</controlfield>
+        <datafield tag="010" ind1=" " ind2=" ">
+            <subfield code="a">afc99990058366</subfield>
+        </datafield>
+        <datafield tag="040" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+            <subfield code="c">DLC</subfield>
+        </datafield>
+        <datafield tag="245" ind1="0" ind2="4">
+            <subfield code="a">The Texas ranger</subfield>
+            <subfield code="h">[sound recording] /</subfield>
+            <subfield code="c">Sung by Beale D. Taylor.</subfield>
+        </datafield>
+        <datafield tag="260" ind1=" " ind2=" ">
+            <subfield code="a">Medina, Texas,</subfield>
+            <subfield code="c">1939.</subfield>
+        </datafield>
+        <datafield tag="300" ind1=" " ind2=" ">
+            <subfield code="a">1 sound disc :</subfield>
+            <subfield code="b">analog, 33 1/3 rpm, mono. ;</subfield>
+            <subfield code="c">12 in.</subfield>
+        </datafield>
+        <datafield tag="651" ind1=" " ind2="0">
+            <subfield code="a">Medina</subfield>
+            <subfield code="z">Texas</subfield>
+            <subfield code="z">United States of America.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, John Avery, 1867-1948</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, Ruby T. (Ruby Terrill)</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Taylor, Beale D.</subfield>
+            <subfield code="e">Singer.</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">American Folklife Center, Library of Congress</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+        </datafield>
+    </record>
+    <record>
+        <leader>     njm a22     uu 4500</leader>
+        <controlfield tag="001">afc99990058366</controlfield>
+        <controlfield tag="003">DLC</controlfield>
+        <controlfield tag="005">20071104155141.9</controlfield>
+        <controlfield tag="007">sd ummunniauub</controlfield>
+        <controlfield tag="008">071103s1939    xxufmnne||||||||| u eng||</controlfield>
+        <datafield tag="010" ind1=" " ind2=" ">
+            <subfield code="a">afc99990058366</subfield>
+        </datafield>
+        <datafield tag="040" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+            <subfield code="c">DLC</subfield>
+        </datafield>
+        <datafield tag="245" ind1="0" ind2="4">
+            <subfield code="a">The Texas ranger</subfield>
+            <!-- invalid utf-8 bytes in the non-printing subfield code -->
+            <subfield code="">[sound recording] /</subfield>
+            <subfield code="c">Sung by Beale D. Taylor.</subfield>
+        </datafield>
+        <datafield tag="260" ind1=" " ind2=" ">
+            <subfield code="a">Medina, Texas,</subfield>
+            <subfield code="c">1939.</subfield>
+        </datafield>
+        <datafield tag="300" ind1=" " ind2=" ">
+            <subfield code="a">1 sound disc :</subfield>
+            <subfield code="b">analog, 33 1/3 rpm, mono. ;</subfield>
+            <subfield code="c">12 in.</subfield>
+        </datafield>
+        <datafield tag="651" ind1=" " ind2="0">
+            <subfield code="a">Medina</subfield>
+            <subfield code="z">Texas</subfield>
+            <subfield code="z">United States of America.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, John Avery, 1867-1948</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, Ruby T. (Ruby Terrill)</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Taylor, Beale D.</subfield>
+            <subfield code="e">Singer.</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">American Folklife Center, Library of Congress</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+        </datafield>
+    </record>
+    <record>
+        <leader>     njm a22     uu 4500</leader>
+        <controlfield tag="001">afc99990058366</controlfield>
+        <controlfield tag="003">DLC</controlfield>
+        <controlfield tag="005">20071104155141.9</controlfield>
+        <controlfield tag="007">sd ummunniauub</controlfield>
+        <controlfield tag="008">071103s1939    xxufmnne||||||||| u eng||</controlfield>
+        <datafield tag="010" ind1=" " ind2=" ">
+            <subfield code="a">afc99990058366</subfield>
+        </datafield>
+        <datafield tag="040" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+            <subfield code="c">DLC</subfield>
+        </datafield>
+        <datafield tag="245" ind1="0" ind2="4">
+            <subfield code="a">The Texas ranger</subfield>
+            <subfield code="h">[sound recording] /</subfield>
+            <subfield code="c">Sung by Beale D. Taylor.</subfield>
+        </datafield>
+        <datafield tag="260" ind1=" " ind2=" ">
+            <subfield code="a">Medina, Texas,</subfield>
+            <subfield code="c">1939.</subfield>
+        </datafield>
+        <datafield tag="300" ind1=" " ind2=" ">
+            <subfield code="a">1 sound disc :</subfield>
+            <subfield code="b">analog, 33 1/3 rpm, mono. ;</subfield>
+            <subfield code="c">12 in.</subfield>
+        </datafield>
+        <datafield tag="651" ind1=" " ind2="0">
+            <subfield code="a">Medina</subfield>
+            <subfield code="z">Texas</subfield>
+            <subfield code="z">United States of America.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, John Avery, 1867-1948</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Lomax, Ruby T. (Ruby Terrill)</subfield>
+            <subfield code="e">Recording engineer.</subfield>
+        </datafield>
+        <datafield tag="700" ind1="1" ind2=" ">
+            <subfield code="a">Taylor, Beale D.</subfield>
+            <subfield code="e">Singer.</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">American Folklife Center, Library of Congress</subfield>
+        </datafield>
+        <datafield tag="852" ind1=" " ind2=" ">
+            <subfield code="a">DLC</subfield>
+        </datafield>
+    </record>
+</collection>

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: marc
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.1
 platform: ruby
 authors:
 - Kevin Clarke
@@ -13,54 +13,74 @@ authors:
 autorequire: marc
 bindir: bin
 cert_chain: []
-date: 2015-01-28 00:00:00.000000000 Z
+date: 2021-06-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
+  name: scrub_rb
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.0.1
-    - - <
+    - - "<"
       - !ruby/object:Gem::Version
         version: '2'
-  name: scrub_rb
-  prerelease: false
   type: :runtime
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.0.1
-    - - <
+    - - "<"
       - !ruby/object:Gem::Version
         version: '2'
 - !ruby/object:Gem::Dependency
+  name: unf
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-  name: unf
+  type: :runtime
   prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rexml
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
   type: :runtime
+  prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-description:
+description:
 email: ehs@pobox.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- Changes
+- LICENSE
+- README.md
+- Rakefile
 - lib/marc.rb
 - lib/marc/constants.rb
 - lib/marc/controlfield.rb
 - lib/marc/datafield.rb
 - lib/marc/dublincore.rb
 - lib/marc/exception.rb
+- lib/marc/marc8/map_to_unicode.rb
+- lib/marc/marc8/to_unicode.rb
 - lib/marc/reader.rb
 - lib/marc/record.rb
 - lib/marc/subfield.rb
@@ -69,14 +89,16 @@ files:
 - lib/marc/xml_parsers.rb
 - lib/marc/xmlreader.rb
 - lib/marc/xmlwriter.rb
-- lib/marc/marc8/map_to_unicode.rb
-- lib/marc/marc8/to_unicode.rb
 - test/bad_eacc_encoding.marc8.marc
 - test/batch.dat
 - test/batch.xml
 - test/cp866_multirecord.marc
 - test/cp866_unimarc.marc
 - test/escaped_character_reference.marc8.marc
+- test/marc8/data/test_marc8.txt
+- test/marc8/data/test_utf8.txt
+- test/marc8/tc_marc8_mapping.rb
+- test/marc8/tc_to_unicode.rb
 - test/marc8_accented_chars.marc
 - test/marc_with_bad_utf8.utf8.marc
 - test/no-leading-zero.xml
@@ -98,42 +120,34 @@ files:
 - test/tc_subfield.rb
 - test/tc_writer.rb
 - test/tc_xml.rb
+- test/tc_xml_error_handling.rb
+- test/three-records-second-bad.xml
 - test/ts_marc.rb
 - test/utf8.marc
 - test/utf8_multirecord.marc
 - test/utf8_with_bad_bytes.marc
-- test/marc8/tc_marc8_mapping.rb
-- test/marc8/tc_to_unicode.rb
-- test/marc8/data/test_marc8.txt
-- test/marc8/data/test_utf8.txt
-- Rakefile
-- README.md
-- Changes
-- LICENSE
 homepage: https://github.com/ruby-marc/ruby-marc/
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: 1.8.6
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.1.9
-signing_key:
+rubygems_version: 3.0.3
+signing_key:
 specification_version: 4
 summary: A ruby library for working with Machine Readable Cataloging
 test_files:
 - test/ts_marc.rb
-has_rdoc: true