RubyGems - marc - Versions diffs - 0.4.4 → 0.5.0 - Mend

marc 0.4.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

data/Changes +13 -0
data/README.md +88 -0
data/Rakefile +2 -26
data/lib/marc.rb +1 -1
data/lib/marc/reader.rb +270 -50
data/lib/marc/version.rb +3 -0
data/lib/marc/writer.rb +11 -3
data/test/bare_cp866.txt +1 -0
data/test/cp866_multirecord.marc +1 -0
data/test/cp866_unimarc.marc +1 -0
data/test/jruby_bad_transcode.rb +52 -0
data/test/jruby_just_string.rb +39 -0
data/test/marc8_accented_chars.marc +1 -0
data/test/tc_bare_ruby_strings.rb +43 -0
data/test/tc_reader.rb +21 -6
data/test/tc_reader_char_encodings.rb +256 -0
data/test/tc_writer.rb +14 -2
data/test/test_cp866.txt +1 -0
data/test/{000039829.marc → utf8.marc} +0 -0
data/test/utf8_multirecord.marc +1 -0
data/test/utf8_with_bad_bytes.marc +1 -0
metadata +73 -41
data/README +0 -55
data/test/t +0 -1

data/lib/marc/version.rb ADDED

@@ -0,0 +1,3 @@
+module MARC
+  VERSION = "0.5.0"
+end

data/lib/marc/writer.rb CHANGED

@@ -55,7 +55,9 @@ module MARC
         field_data += END_OF_FIELD
         # calculate directory entry for the field
-        field_length = field_data.length()
+        field_length = (field_data.respond_to?(:bytesize) ?
+          field_data.bytesize() :
+          field_data.length())
         directory += sprintf("%03s%04i%05i", field.tag, field_length,
           offset)
@@ -73,10 +75,16 @@ module MARC
       marc = base + fields + END_OF_RECORD
      # update leader with the byte offset to the end of the directory
-      marc[12..16] = sprintf("%05i", base.length())
+      marc[12..16] = sprintf("%05i", (base.respond_to?(:bytesize) ?
+        base.bytesize() :
+        base.length() )
+      )
       # update the record length
-      marc[0..4] = sprintf("%05i", marc.length())
+      marc[0..4] = sprintf("%05i", (marc.respond_to?(:bytesize) ?
+        marc.bytesize() :
+        marc.length())
+      )
       # store updated leader in the record that was passed in
       record.leader = marc[0..LEADER_LENGTH-1]

data/test/bare_cp866.txt ADDED

	@@ -0,0 +1 @@
1	+ ��㭠�. �� ⬠��

data/test/cp866_multirecord.marc ADDED

@@ -0,0 +1 @@

+ 011670000000002290004500001001600000003003300016004000500049005001200054006003400066007000500100021012500105035000300230036001000233042000300243043001000246076000800256100064200264501000500906504000900911507000300920514001400923��⥭�� . �.��㭠�. �� ⬠��.07.01-01�.1�ਡ��஥��%��१�஢��2005��⮬�⨧�� ஥��஢�� ⮢�� ந��⢠ ��㪮��த�樨 � �� ଠ樮�� ࠭�⢥ �।��01501.01.13RU�. 56, 58~N 9-10�࣠��஬ ᥬ�� 㯨�� _5��-��஭_6, �� ७�� ଠ樮�� 孮�� ᨩ᪮� �஬�諥��. �।�⠢�� ஥�� CRAFT (Cooperative Research Action for Technology - ᮢ�� ஥�� ᫥�� ᮧ�� 孮��), ��䨭��஢�� , � ��஬ �� ஫� ��ࠫ� �த�� Cimatron. �� ஥�� 뫮 ࠧ��⨥ �� 孮�� ।�� १�஢��. ��室�� ࠡ�� 뫠 ��᫮�� ⥬, �� ᥣ�� ⥩ (��ਬ��, ��ࣨ�, ��⨪�, ᥭ�ਪ�, � ��㣨�) �ॡ�� ਬ�� ⬠�ᮢ�� ⠫�� ᪮��᪨� ࠧ��஢AB0150.01.1301AB2007-��011670000000002290004500001001600000003003300016004000500049005001200054006003400066007000500100021012500105035000300230036001000233042000300243043001000246076000800256100064200264501000500906504000900911507000300920514001400923��⥭�� . �.��㭠�. �� ⬠��.07.01-01�.1�ਡ��஥��%��१�஢��2005��⮬�⨧�� ஥��஢�� ⮢�� ந��⢠ ��㪮��த�樨 � �� ଠ樮�� ࠭�⢥ �।��01501.01.13RU�. 56, 58~N 9-10�࣠��஬ ᥬ�� 㯨�� _5��-��஭_6, �� ७�� ଠ樮�� 孮�� ᨩ᪮� �஬�諥��. �।�⠢�� ஥�� CRAFT (Cooperative Research Action for Technology - ᮢ�� ஥�� ᫥�� ᮧ�� 孮��), ��䨭��஢�� , � ��஬ �� ஫� ��ࠫ� �த�� Cimatron. �� ஥�� 뫮 ࠧ��⨥ �� 孮�� ।�� १�஢��. ��室�� ࠡ�� 뫠 ��᫮�� ⥬, �� ᥣ�� ⥩ (��ਬ��, ��ࣨ�, ��⨪�, ᥭ�ਪ�, � ��㣨�) �ॡ�� ਬ�� ⬠�ᮢ�� ⠫�� ᪮��᪨� ࠧ��஢AB0150.01.1301AB2007-��

data/test/cp866_unimarc.marc ADDED

@@ -0,0 +1 @@

data/test/jruby_bad_transcode.rb ADDED

@@ -0,0 +1,52 @@
+# encoding: utf-8
+# 1.9.3p0 :005 > 0x8D.chr.force_encoding("cp866").encode("UTF-8")
+utf8 = "Н".force_encoding("UTF-8")
+puts "There's a cyrillic letter that looks kinda like a capital H. Here's what it looks like in unicode: Н"
+puts "In unicode, that's byte array: " + utf8.bytes.to_a.inspect
+puts "We're gonna use String#encode to convert it to an IBM866 encoding, also known as cp866, an encoding sometimes used in Russia."
+puts "  `utf8.encode(\"IBM866\")`"
+cp866 = utf8.encode("IBM866")
+puts cp866.bytes.to_a.inspect
+exit
+puts
+puts "In cp866, the actual bytes are: #{cp866_phrase.bytes.to_a.inspect}"
+puts
+puts "We're going to write the cp866 string to disk, using binary:binary to try and make sure we get the bytes to disk without transcoding."
+write = File.open("test_cp866.txt", "w", :internal_encoding => "binary", :external_encoding => "binary")
+write.puts cp866_phrase
+write.close
+puts
+puts "Now we're going to read it in with a File object with external_encoding set to IBM866, but no internal_encoding set."
+puts
+puts "Make sure we have no default internal_encoding: " + Encoding.default_internal.nil?.inspect
+read = File.open("test_cp866.txt", :external_encoding => "cp866")
+puts
+puts "Our ruby file object should have external_encoding of IBM866: " + read.external_encoding.inspect
+puts "  and internal_encoding nil: " + read.internal_encoding.inspect
+puts
+read_in_string = read.read
+read.close
+puts "The encoding of the string we read in should be IBM866: " + (read_in_string.encoding.name == "IBM866").inspect
+puts
+puts "And the bytes should be the very same bytes we wrote out (which are valid cp866) " + (read_in_string.bytes.to_a[0,3] == [140, 165, 166]).inspect + " (#{read_in_string.bytes.to_a})"
+puts "The above is TRUE in MRI 1.9.3, but FALSE in jruby "

data/test/jruby_just_string.rb ADDED

@@ -0,0 +1,39 @@
+# encoding: binary
+# jruby 1.6.7 (ruby-1.9.2-p312) (2012-02-22 3e82bc8) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_20) [linux-amd64-java]
+# There is a letter in cyrillic that looks kind of like a capital
+# H.  In the cp866 encoding (http://en.wikipedia.org/wiki/Code_page_866)
+# it's represented by "\x8D" which is decimal 141.
+#
+# In ruby 1.9, it _ought_ to be possible to have those bytes
+# in a string, and tell ruby it's cp866.
+cp866 = "\x8D".force_encoding("IBM866")
+# in MRI 1.9.3, if we inspect that, we get "\x8D", just like we expect.
+# and if we look at #bytes.to_a, we get [141], just like we expect.
+puts cp866.inspect
+puts cp866.bytes.to_a.inspect
+# However, in jruby if we #inspect instead of getting "\x8D",
+# we get "\u008D" -- this is wrong, it's NOT that unicode codepoint.
+# In jruby, bytes.to_a.inspect is still [141], it hasn't changed
+# the bytes, but it's confused about what's going on.
+# We see this encoding confusion demonstrated if we try
+# a String#encode.
+#
+# MRI 1.9.3 is perfectly capable of transcoding this to UTF-8
+utf8 = cp866.encode("UTF-8")
+puts utf8.inspect # =>  in MRI displays cyrillic in terminal no prob
+puts utf8.bytes.to_a.inspect # => in MRI [208, 157], proper bytes for utf8
+# In jruby, puts utf8.inspect displays "\u008D", and
+# utf8.bytes.to_a.inspect is [194, 141]. I don't know where the
+# 194 came from, but it has NOT successfully transcoded to utf8.
+# In other cases, the #encode will actually raise an illegal byte
+# exception if the original bytes were not legal for UTF8 (or UTF16?) --
+# but the original bytes were not meant to be considered unicode at all.

data/test/marc8_accented_chars.marc ADDED

@@ -0,0 +1 @@

+ 01120nam a22003011 4500001001000000003000400010005001700014008004100031010001300072035002400085035002400109035002300133040002400156100002500180245003700205260003100242300001100273490002700284504003100311505032600342650005500668852003300723970001300756971000800769972001300777973001900790998000900809000004951MiU19880715000000.0880715|1966||||||| |||||||fre|u a67006971 a(RLIN)MIUG0344054-B a(CaOTULAS)159823738 a(OCoLC)ocm00344054 cODaWUdMiUdCStRLIN1 aSerreau, Genevi�eve.10aHistoire du "nouveau th�e�atre." a[Paris]bGallimardcc1966. a190 p.0 aCollection Id�ees, 104 aBibliographical footnotes.0 aQuelques vivants piliers.--L'imm�ediat apr�es-guerre.--La f�ete des mots.--Eug�ene Ionesco.--Arthur Adamov.--Samuel Beckett.--Jean Genet.--Jean Vauthier.--Georges Schehad�e.--La rel�eve de l'avant-garde.--Les metteurs en sc�ene du "nouveau th�e�atre."--Cr�eations des principaux metteurs en sc�ene du "nouveau th�e�atre." 0aFrench dramay20th centuryxHistory and criticism.1 aMiUbBUHRcGRADh842 S4817hi aBKbBook aMiU c20040625 aACbavail_circ s9665

data/test/tc_bare_ruby_strings.rb ADDED

@@ -0,0 +1,43 @@
+require 'test/unit'
+class TestBareRubyStrings < Test::Unit::TestCase
+ # The file bare_cp866.txt has in it a phrase encoded in cp866,
+ # that if it were translated to utf8 would be:
+ # "Междунар. новости мира пластмасс\n"
+ #
+ # The first few bytes of that in utf8 are:
+ # "\xD0\x9C\xD0\xB5"
+ #
+ # In cp866 as it is on disk, its first few bytes are "\x8C\xA5"
+ def test_read_cp866_with_external_encoding
+   return
+   file = File.open("test/bare_cp866.txt", "r:cp866")
+   string = file.read
+   assert_equal "IBM866", string.encoding.name
+   cp866_binary = string.dup.force_encoding("binary")
+   assert cp866_binary.start_with?( "\x8C\xA5".force_encoding("binary")  )
+   transcoded = string.encode("UTF-8")
+   assert_equal "UTF-8", transcoded.encoding.name
+   utf8_binary = transcoded.dup.force_encoding("binary")
+   assert utf8_binary.start_with?( "\xD0\x9C\xD0\xB5".force_encoding("binary"))
+ end
+ def test_read_cp866_binary_all_the_way
+   # tell ruby to treat it as binary binary binary
+   file = File.open("test/bare_cp866.txt", :external_encoding => "binary", :internal_encoding => "binary")
+   string = file.read
+   # we should get the same bytes that were on disk, right?
+   assert string.start_with?( "\x8C\xA5".force_encoding("binary"))
+ end
+end

data/test/tc_reader.rb CHANGED

@@ -1,3 +1,5 @@
+# -*- encoding: utf-8 -*-
 require 'test/unit'
 require 'marc'
@@ -17,12 +19,30 @@ class ReaderTest < Test::Unit::TestCase
     assert_equal(10, count)
   end
+  def test_loose_utf8
+    # This isn't actually a corrupt file, but it is utf8,
+    # and I have some reason to believe forgiving reader isn't
+    # working properly with UTF8 in ruby 1.9, so testing it.
+    reader = MARC::ForgivingReader.new('test/utf8.marc')
+    count = 0
+    reader.each { count += 1 }
+    assert_equal(1, count)
+  end
+  def test_loose_unimarc
+    # Unimarc might use a different record separator? Let's make sure it works.
+    reader = MARC::Reader.new(File.open('test/cp866_unimarc.marc', 'r:cp866'))
+    count = 0
+    reader.each {|a| count += 1 }
+    assert_equal(1, count)
+  end
   def test_non_numeric_tags
     reader = MARC::Reader.new('test/non-numeric.dat')
     count = 0
     record = nil
     reader.each do | rec |
-      count += 1
+      count += 1
       record = rec
     end
     assert_equal(1, count)
@@ -30,11 +50,6 @@ class ReaderTest < Test::Unit::TestCase
     assert_equal('1', record['LOC']['9'])
   end
-  def test_unicode_load
-    reader =   MARC::Reader.new('test/000039829.marc')
-    assert_nothing_raised { reader.first }
-  end
   def test_bad_marc
     reader = MARC::Reader.new('test/tc_reader.rb')
     assert_raises(MARC::Exception) {reader.entries[0]}

data/test/tc_reader_char_encodings.rb ADDED

@@ -0,0 +1,256 @@
+# -*- encoding: utf-8 -*-
+require 'test/unit'
+require 'marc'
+# Testing char encodings under 1.9. Don't bother running
+# these tests except under 1.9: they will either fail (because
+# a 1.9 func the test itself uses isn't there), or trivially pass
+# (because the func they are testing is a no-op before 1.9).
+if "".respond_to?(:encoding)
+  class ReaderCharEncodingsTest < Test::Unit::TestCase
+    ####
+    # Helper methods for our tests
+    #
+    ####
+    @@utf_marc_path = 'test/utf8.marc'
+    # tests against record at test/utf8.marc
+    def assert_utf8_right_in_utf8(record)
+      assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
+      assert_equal "UTF-8", record['245'].to_s.encoding.name
+      assert_equal "UTF-8", record['245'].subfields.first.to_s.encoding.name
+      assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
+      assert_equal "UTF-8", record['245']['a'].encoding.name
+      assert record['245']['a'].start_with?("Photčhanānukrom")
+    end
+    # Test against multirecord just to be sure that works.
+    # the multirecord file is just two concatenated copies
+    # of the single one.
+    @@cp866_marc_path = "test/cp866_multirecord.marc"
+    # assumes record in test/cp866_unimarc.marc
+    # Pass in an encoding name, using ruby's canonical name!
+    # "IBM866" not "cp866". "UTF-8".
+    def assert_cp866_right(record, encoding = "IBM866")
+      assert_equal(encoding, record['001'].value.encoding.name)
+      assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
+    end
+    ####
+    # end helper methods
+    ####
+    def test_unicode_load
+      reader = MARC::Reader.new(@@utf_marc_path)
+      record = nil
+      assert_nothing_raised { record = reader.first }
+      assert_utf8_right_in_utf8(record)
+    end
+    def test_unicode_decode_forgiving
+      # two kinds of forgiving invocation, they shouldn't be different,
+      # but just in case they have slightly different code paths, test em
+      # too.
+      marc_string = File.open(@@utf_marc_path).read.force_encoding("utf-8")
+      record = MARC::Reader.decode(marc_string, :forgiving => true)
+      assert_utf8_right_in_utf8(record)
+      reader = MARC::ForgivingReader.new(@@utf_marc_path)
+      record = reader.first
+      assert_utf8_right_in_utf8(record)
+    end
+    def test_unicode_forgiving_reader_passes_options
+      # Make sure ForgivingReader accepts same options as MARC::Reader
+      # We don't test them ALL though, just a sample.
+      # Tell it we're reading cp866, but transcode to utf8 for us.
+      reader = MARC::ForgivingReader.new(@@cp866_marc_path, :external_encoding => "cp866", :internal_encoding => "utf-8")
+      record = reader.first
+      assert_cp866_right(record, "UTF-8")
+    end
+    def test_explicit_encoding
+      reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866')
+      assert_cp866_right(reader.first, "IBM866")
+    end
+    def test_bad_encoding_name_input
+      reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'adadfadf')
+      assert_raises ArgumentError do
+        reader.first
+      end
+    end
+    def test_marc8_with_binary
+      # Marc8, best we can do is read it in binary.
+      reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
+      record = reader.first
+      assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
+    end
+    def test_load_file_opened_with_external_encoding
+      reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
+      record = reader.first
+      # Make sure it's got the encoding it's supposed to.
+      assert_cp866_right(record, "IBM866")
+    end
+    def test_explicit_encoding_beats_file_encoding
+      reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:utf-8'), :external_encoding => "cp866")
+      record = reader.first
+      assert_cp866_right(record, "IBM866")
+    end
+    def test_from_string_with_utf8_encoding
+      marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8")
+      reader = MARC::Reader.new(StringIO.new(marc_string))
+      record = reader.first
+      assert_utf8_right_in_utf8(record)
+    end
+    def test_from_string_with_cp866
+      marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
+      reader = MARC::Reader.new(StringIO.new(marc_string))
+      record = reader.first
+      assert_cp866_right(record, "IBM866")
+    end
+    def test_decode_from_string_with_cp866
+      marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
+      record = MARC::Reader.decode(marc_string)
+      assert_cp866_right(record, "IBM866")
+    end
+    def test_with_transcode
+      reader = MARC::Reader.new(@@cp866_marc_path,
+        :external_encoding => 'cp866',
+        :internal_encoding => 'UTF-8')
+      record = reader.first
+      assert_cp866_right(record, "UTF-8")
+    end
+    def test_with_binary_filehandle
+      # about to recommend this as a foolproof way to avoid
+      # ruby transcoding behind your back in docs, let's make
+      # sure it really works.
+      reader = MARC::Reader.new(File.open(@@cp866_marc_path, :external_encoding => "binary", :internal_encoding => "binary"),
+        :external_encoding => "IBM866")
+      record = reader.first
+      assert_cp866_right(record, "IBM866")
+    end
+    def test_with_bad_source_bytes
+      reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
+        :external_encoding => "UTF-8",
+        :validate_encoding => true)
+      assert_raise Encoding::InvalidByteSequenceError do
+        record = reader.first
+      end
+    end
+    def test_bad_source_bytes_with_replace
+      reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
+        :external_encoding => "UTF-8", :invalid => :replace)
+      record = nil
+      assert_nothing_raised do
+        record = reader.first
+      end
+      # it should have the unicode replacement char where the bad
+      # byte was.
+      assert_match '=> ' +  "\uFFFD" + '( <=', record['245']['a']
+    end
+    def test_bad_source_bytes_with_custom_replace
+      reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
+        :external_encoding => "UTF-8", :invalid => :replace, :replace => '')
+      record = reader.first
+      # bad byte replaced with empty string, gone.
+      assert_match '=> ( <=', record['245']['a']
+    end
+    def test_default_internal_encoding
+      # Some people WILL be changing their Encoding.default_internal
+      # It's even recommended by wycats
+      # http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
+      # This will in some cases make ruby File object trans-code
+      # by default. Trans-coding a serial marc binary can change the
+      # byte count and mess it up.
+      #
+      # But at present, because of the way the Reader is implemented reading
+      # specific bytecounts, it _works_, although it does not _respect_
+      # Encoding.default_internal. That's the best we can do right now,
+      # this test is important to ensure it stays at least this good.
+       begin
+         original = Encoding.default_internal
+         Encoding.default_internal = "UTF-8"
+         reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
+         record = reader.first
+         assert_cp866_right(record, "IBM866")
+       ensure
+         Encoding.default_internal = original
+       end
+    end
+    def test_default_internal_encoding_with_string_arg
+      begin
+         original = Encoding.default_internal
+         Encoding.default_internal = "UTF-8"
+         reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => "cp866")
+         record = reader.first
+         assert_cp866_right(record, "IBM866")
+       ensure
+         Encoding.default_internal = original
+       end
+    end
+  end
+else
+  require 'pathname'
+  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
+end