RubyGems - marc - Versions diffs - 0.6.0 → 0.7.0 - Mend

marc 0.6.0 → 0.7.0

Files changed (8) hide show

data/README.md +1 -0
data/lib/marc/reader.rb +1 -1
data/lib/marc/version.rb +1 -1
data/lib/marc/writer-NEW.rb +108 -0
data/lib/marc/writer.rb +62 -17
data/test/tc_reader_char_encodings.rb +2 -0
data/test/tc_writer.rb +98 -3
metadata +3 -2

data/README.md CHANGED Viewed

@@ -1,3 +1,4 @@
+[![Gem Version](https://badge.fury.io/rb/marc.png)](http://badge.fury.io/rb/marc)
 [![Build Status](https://secure.travis-ci.org/ruby-marc/ruby-marc.png)](http://travis-ci.org/ruby-marc/ruby-marc)
 marc is a ruby library for reading and writing MAchine Readable Cataloging

data/lib/marc/reader.rb CHANGED Viewed

@@ -396,7 +396,7 @@ module MARC
   # Like Reader ForgivingReader lets you read in a batch of MARC21 records
   # but it does not use record lengths and field byte offsets found in the
   # leader and directory. It is not unusual to run across MARC records
-  # which have had their offsets calcualted wrong. In situations like this
+  # which have had their offsets calculated wrong. In situations like this
   # the vanilla Reader may fail, and you can try to use ForgivingReader.
   #
   # The one downside to this is that ForgivingReader will assume that the

data/lib/marc/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module MARC
-  VERSION = "0.6.0"
+  VERSION = "0.7.0"
 end

data/lib/marc/writer-NEW.rb ADDED Viewed

@@ -0,0 +1,108 @@
+module MARC
+  # A class for writing MARC records as MARC21.
+  class Writer
+    # the constructor which you must pass a file path
+    # or an object that responds to a write message
+    def initialize(file)
+      if file.class == String
+        @fh = File.new(file,"w")
+      elsif file.respond_to?('write')
+        @fh = file
+      else
+        throw "must pass in file name or handle"
+      end
+    end
+    # write a record to the file or handle
+    def write(record)
+      @fh.write(MARC::Writer.encode(record))
+    end
+    # close underlying filehandle
+    def close
+      @fh.close
+    end
+    # a static method that accepts a MARC::Record object
+    # and returns the record encoded as MARC21 in transmission format
+    def self.encode(record)
+      directory = ''
+      fields = ''
+      offset = 0
+      for field in record.fields
+        # encode the field
+        field_data = ''
+        if field.class == MARC::DataField
+          warn("Warn:  Missing indicator") unless field.indicator1 && field.indicator2
+          field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
+          for s in field.subfields
+            field_data += SUBFIELD_INDICATOR + s.code + s.value
+          end
+        elsif field.class == MARC::ControlField
+          field_data = field.value
+        end
+        field_data += END_OF_FIELD
+        # calculate directory entry for the field
+        field_length = (field_data.respond_to?(:bytesize) ?
+          field_data.bytesize() :
+          field_data.length())
+        directory += sprintf("%03s", field.tag) + format_byte_count(field_length, 4) + format_byte_count(offset)
+        # add field to data for other fields
+        fields += field_data
+        # update offset for next field
+        offset += field_length
+      end
+      # determine the base (leader + directory)
+      base = record.leader + directory + END_OF_FIELD
+      # determine complete record
+      marc = base + fields + END_OF_RECORD
+      # update leader with the byte offset to the end of the directory
+      marc[12..16] = format_byte_count(base.respond_to?(:bytesize) ?
+        base.bytesize() :
+        base.length()
+      )
+      # update the record length
+      marc[0..4] = format_byte_count(marc.respond_to?(:bytesize) ?
+        marc.bytesize() :
+        marc.length()
+      )
+      # store updated leader in the record that was passed in
+      record.leader = marc[0..LEADER_LENGTH-1]
+      # return encoded marc
+      return marc
+    end
+    def self.format_byte_count(number, num_digits=5)
+      formatted = sprintf("%0#{num_digits}i", number)
+      if formatted.length > num_digits
+        # uh, oh, we've exceeded our max. This version neither zeroes out
+        # nor raises: it fills the slot with all 9's instead.
+        #formatted = sprintf("%0#{num_digits}i", "")
+        formatted = "9" * num_digits
+      end
+      return formatted
+    end
+  end
+end

data/lib/marc/writer.rb CHANGED Viewed

@@ -1,8 +1,30 @@
 module MARC
-  # A class for writing MARC records as MARC21.
+  # A class for writing MARC records as binary MARC (ISO 2709)
+  #
+  # == Too-long records
+  #
+  # The MARC binary format only allows records of at most 99999 bytes in total,
+  # due to the size of a length field in the record.
+  #
+  # By default, the Writer will raise a MARC::Exception when encountering
+  # in-memory records that are too long to be legally written out as ISO 2709
+  # binary.
+  # However, if you set `allow_oversized` to true, then the Writer will
+  # write these records out anyway, filling in any binary length/offset slots
+  # with all 0's, if they are not wide enough to hold the true value.
+  # While these records are illegal, they can still be read back in using
+  # the MARC::ForgivingReader, as well as other platform MARC readers
+  # in tolerant mode.
+  #
+  # If you set `allow_oversized` to false on the Writer, a MARC::Exception
+  # will be raised instead, if you try to write an oversized record.
+  #
+  #    writer = Writer.new(some_path)
+  #    writer.allow_oversized = true
   class Writer
+    attr_accessor :allow_oversized
     # the constructor which you must pass a file path
     # or an object that responds to a write message
@@ -15,13 +37,14 @@ module MARC
       else
         throw "must pass in file name or handle"
       end
+      self.allow_oversized = false
     end
     # write a record to the file or handle
     def write(record)
-      @fh.write(MARC::Writer.encode(record))
+      @fh.write(MARC::Writer.encode(record, self.allow_oversized))
     end
@@ -34,8 +57,10 @@ module MARC
     # a static method that accepts a MARC::Record object
     # and returns the record encoded as MARC21 in transmission format
-    def self.encode(record)
+    #
+    # Second arg allow_oversized, default false, set to true to write
+    # (rather than raise on) a MARC record that can't fit into ISO 2709.
+    def self.encode(record, allow_oversized = false)
       directory = ''
       fields = ''
       offset = 0
@@ -58,8 +83,8 @@ module MARC
         field_length = (field_data.respond_to?(:bytesize) ?
           field_data.bytesize() :
           field_data.length())
-        directory += sprintf("%03s%04i%05i", field.tag, field_length,
-          offset)
+        directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
         # add field to data for other fields
         fields += field_data
@@ -75,22 +100,42 @@ module MARC
       marc = base + fields + END_OF_RECORD
       # update leader with the byte offset to the end of the directory
-      marc[12..16] = sprintf("%05i", (base.respond_to?(:bytesize) ?
-        base.bytesize() :
-        base.length() )
-      )
+      bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
+      marc[12..16] = format_byte_count(bytesize, allow_oversized)
       # update the record length
-      marc[0..4] = sprintf("%05i", (marc.respond_to?(:bytesize) ?
-        marc.bytesize() :
-        marc.length())
-      )
+      bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
+      marc[0..4] = format_byte_count(bytesize, allow_oversized)
       # store updated leader in the record that was passed in
       record.leader = marc[0..LEADER_LENGTH-1]
       # return encoded marc
-      return marc
+      return marc
     end
+    # Formats numbers for insertion into marc binary slots.
+    # These slots only allow so many digits (and need to be left-padded
+    # with zeros to that number of digits). If the number
+    # is too big, either an exception will be raised, or
+    # we'll return all 0's to proper number of digits.
+    #
+    # first arg is number, second is boolean whether to allow oversized,
+    # third is max digits (default 5)
+    def self.format_byte_count(number, allow_oversized, num_digits=5)
+      formatted = sprintf("%0#{num_digits}i", number)
+      if formatted.length > num_digits
+        # uh, oh, we've exceeded our max. Either zero out
+        # or raise, depending on settings.
+        if allow_oversized
+          formatted = sprintf("%0#{num_digits}i", 0)
+        else
+          raise MARC::Exception.new("Can't write MARC record, as length/offset value of #{number} is too long for the #{num_digits} slot in binary format.")
+        end
+      end
+      return formatted
+    end
   end
 end

data/test/tc_reader_char_encodings.rb CHANGED Viewed

@@ -3,6 +3,8 @@
 require 'test/unit'
 require 'marc'
+require 'stringio'
 # Testing char encodings under 1.9, don't bother running
 # these tests except under 1.9, will either fail (because
 # 1.9 func the test itself uses isn't there), or trivially pass

data/test/tc_writer.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 require 'test/unit'
 require 'marc'
+require 'stringio'
 class WriterTest < Test::Unit::TestCase
     def test_writer
@@ -19,6 +21,99 @@ class WriterTest < Test::Unit::TestCase
         # cleanup
         File.unlink('test/writer.dat')
     end
+    # Only in ruby 1.9
+    if "".respond_to?(:encoding)
+      def test_writer_bad_encoding
+        writer = MARC::Writer.new('test/writer.dat')
+        # MARC::Writer should just happily write out whatever bytes you give it, even
+        # mixing encodings that can't be mixed. We ran into an actual example mixing
+        # MARC8 (tagged ruby binary) and UTF8, we want it to be written out.
+        record = MARC::Record.new
+        record.append MARC::DataField.new('700', '0', ' ', ['a', "Nhouy Abhay,".force_encoding("BINARY")], ["c", "Th\xE5ao,".force_encoding("BINARY")], ["d", "1909-"])
+        record.append MARC::DataField.new('700', '0', ' ', ['a', "Somchin P\xF8\xE5o. Ngin,".force_encoding("BINARY")])
+        record.append MARC::DataField.new('100', '0', '0', ['a', "\xE5angkham. ".force_encoding("BINARY")])
+        record.append MARC::DataField.new('245', '1', '0', ['b', "chef-d'oeuvre de la litt\xE2erature lao".force_encoding("BINARY")])
+        # One in UTF8 and marked
+        record.append MARC::DataField.new('999', '0', '1', ['a', "chef-d'ocuvre de la littU+FFC3\U+FFA9rature".force_encoding("UTF-8")])
+        writer.write(record)
+        writer.close
+      ensure
+          File.unlink('test/writer.dat')
+      end
+    end
+    def test_write_too_long_iso2709
+      too_long_record = MARC::Record.new
+      1.upto(1001) do
+        too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
+      end
+      wbuffer = StringIO.new("", "w")
+      writer = MARC::Writer.new(wbuffer)
+      writer.allow_oversized = true
+      writer.write(too_long_record)
+      writer.close
+      assert_equal "00000", wbuffer.string.slice(0, 5), "zero'd out length bytes when too long"
+      rbuffer = StringIO.new(wbuffer.string.dup)
+      # Regular reader won't read our illegal record.
+      #assert_raise(NoMethodError) do
+      #  reader = MARC::Reader.new(rbuffer)
+      #  reader.first
+      #end
+      # Forgiving reader will, round trippable
+      new_record = MARC::Reader.decode(rbuffer.string, :forgiving => true)
+      assert_equal too_long_record, new_record, "Too long record round-trippable with forgiving mode"
+      # Test in the middle of a MARC file
+      good_record = MARC::Record.new
+      good_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A short record'])
+      wbuffer = StringIO.new("", "w")
+      writer = MARC::Writer.new(wbuffer)
+      writer.allow_oversized = true
+      writer.write(good_record)
+      writer.write(too_long_record)
+      writer.write(good_record)
+      rbuffer = StringIO.new(wbuffer.string.dup)
+      reader  = MARC::ForgivingReader.new(rbuffer)
+      records = reader.to_a
+      assert_equal 3, records.length
+      assert_equal good_record, records[0]
+      assert_equal good_record, records[2]
+      assert_equal too_long_record, records[1]
+    end
+    def test_raises_on_too_long_if_configured
+      too_long_record = MARC::Record.new
+      1.upto(1001) do
+        too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
+      end
+      wbuffer = StringIO.new("", "w")
+      writer = MARC::Writer.new(wbuffer)
+      assert_raise(MARC::Exception) do
+        writer.write too_long_record
+      end
+    end
     def test_forgiving_writer
       marc = "00305cam a2200133 a 4500001000700000003000900007005001700016008004100033008004100074035002500115245001700140909001000157909000400167\036635145\036UK-BiLMS\03620060329173705.0\036s1982iieng6                  000 0 eng||\036060116|||||||||xxk                 eng||\036  \037a(UK-BiLMS)M0017366ZW\03600\037aTest record.\036  \037aa\037b\037c\036\037b0\036\035\000"
@@ -29,18 +124,18 @@ class WriterTest < Test::Unit::TestCase
     end
     def test_unicode_roundtrip
-      record = MARC::Reader.new('test/utf8.marc').first
+      record = MARC::Reader.new('test/utf8.marc', :external_encoding => "UTF-8").first
       writer = MARC::Writer.new('test/writer.dat')
       writer.write(record)
       writer.close
-      read_back_record = MARC::Reader.new('test/writer.dat').first
+      read_back_record = MARC::Reader.new('test/writer.dat', :external_encoding => "UTF-8").first
       # Make sure the one we wrote out then read in again
       # is the same as the one we read the first time
       # Looks like "==" is over-ridden to do that. Don't ever change, #==
-      assert (record == read_back_record), "Round-tripped record must equal original record"
+      assert_equal record, read_back_record, "Round-tripped record must equal original record"
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: marc
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.7.0
   prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ authors:
 autorequire: marc
 bindir: bin
 cert_chain: []
-date: 2013-08-19 00:00:00.000000000 Z
+date: 2013-09-03 00:00:00.000000000 Z
 dependencies: []
 description:
 email: ehs@pobox.com
@@ -30,6 +30,7 @@ files:
 - lib/marc/record.rb
 - lib/marc/subfield.rb
 - lib/marc/version.rb
+- lib/marc/writer-NEW.rb
 - lib/marc/writer.rb
 - lib/marc/xml_parsers.rb
 - lib/marc/xmlreader.rb