RubyGems - marc - Versions diffs - 0.7.1 → 0.8.0 - Mend

marc 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

data/README.md +7 -18
data/Rakefile +3 -4
data/lib/marc/marc8/map_to_unicode.rb +16458 -0
data/lib/marc/marc8/to_unicode.rb +198 -0
data/lib/marc/reader.rb +133 -112
data/lib/marc/version.rb +1 -1
data/test/bad_eacc_encoding.marc8.marc +1 -0
data/test/bib178448.okay.human +24 -0
data/test/bib178448.okay.marc +1 -0
data/test/bib178448.writtenout.marc +1 -0
data/test/escaped_character_reference.marc8.marc +1 -0
data/test/marc8/data/test_marc8.txt +1514 -0
data/test/marc8/data/test_utf8.txt +1514 -0
data/test/marc8/tc_marc8_mapping.rb +11 -0
data/test/marc8/tc_to_unicode.rb +154 -0
data/test/marc_with_bad_utf8.utf8.human +40 -0
data/test/marc_with_bad_utf8.utf8.marc +1 -0
data/test/tc_reader_char_encodings.rb +92 -5
metadata +61 -15
data/test/tc_weird_jruby_bytes.rb +0 -62

data/test/marc8/tc_marc8_mapping.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'test/unit'
+require 'marc'
+require 'marc/marc8/map_to_unicode'
+class TestMarc8Mapping < Test::Unit::TestCase
+  # Smoke test: the CODESETS constant is defined and has a truthy entry
+  # at [0x34][0xa1]. Only presence is checked, not the mapped value.
+  def test_codesets_just_exist
+    codesets = MARC::Marc8::MapToUnicode::CODESETS
+    assert codesets
+    assert codesets[0x34]
+    assert codesets[0x34][0xa1]
+  end
+end

data/test/marc8/tc_to_unicode.rb ADDED Viewed

@@ -0,0 +1,154 @@
+# encoding: UTF-8
+require 'test/unit'
+require 'marc'
+require 'marc/marc8/to_unicode'
+require 'unf'
+if "".respond_to?(:encoding)
+  # Tests for MARC::Marc8::ToUnicode#transcode: empty input, a known sample,
+  # the bulk line-by-line fixtures, the :normalization and :expand_ncr
+  # options, and the handling of invalid bytes and escapes.
+  class TestMarc8ToUnicode < Test::Unit::TestCase
+    # Both "" and nil are expected to transcode to an empty UTF-8 string.
+    def test_empty_string
+      value = MARC::Marc8::ToUnicode.new.transcode("")
+      assert_equal "UTF-8", value.encoding.name
+      assert_equal "", value
+      value = MARC::Marc8::ToUnicode.new.transcode(nil)
+      assert_equal "UTF-8", value.encoding.name
+      assert_equal "", value
+    end
+    # A single MARC-8 sample with combining diacritics, compared against
+    # the NFC-normalized expected string.
+    def test_one_example_marc8
+      value = MARC::Marc8::ToUnicode.new.transcode("Conversa\xF0c\xE4ao")
+      assert_equal "UTF-8", value.encoding.name
+      expected = UNF::Normalizer.normalize("Conversação", :nfc)
+      assert_equal expected, value
+    end
+    # Transcodes every line of test_marc8.txt and compares it with the
+    # line at the same position in test_utf8.txt.
+    def test_lots_of_marc8_test_cases
+      # Heap of test cases taken from pymarc, which provided these
+      # two data files, marc8 and utf8, with line-by-line correspondences.
+      #
+      # For now, we have NOT included proprietary III encodings in our test data!
+      utf8_path  = File.expand_path("../data/test_utf8.txt", __FILE__)
+      marc8_path = File.expand_path("../data/test_marc8.txt", __FILE__)
+      converter  = MARC::Marc8::ToUnicode.new
+      i = 0
+      # Block form of File.open so both fixture files are closed even when
+      # an assertion fails part-way through.
+      File.open(utf8_path, "r:UTF-8") do |utf8_file|
+        File.open(marc8_path, "r:binary") do |marc8_file|
+          begin
+            loop do
+              i += 1
+              utf8      = utf8_file.readline.chomp
+              marc8     = marc8_file.readline.chomp
+              converted = converter.transcode(marc8)
+              assert_equal "UTF-8", converted.encoding.name, "Converted data line #{i} is tagged UTF-8"
+              assert converted.valid_encoding?, "Converted data line #{i} is valid_encoding"
+              assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
+            end
+          rescue EOFError
+            # just means the file was over, no biggie
+            assert i > 1500, "Read as many lines as we expected to, at least 1500"
+          rescue Exception => e
+            # Deliberately broad: report the failing data line, then re-raise
+            # the original error unchanged.
+            $stderr.puts "Error at test data line #{i}"
+            raise e
+          end
+        end
+      end
+    end
+    # Each supported :normalization value yields the matching Unicode
+    # normalization form of the same text.
+    def test_explicit_normalization
+      # \xC1 is Marc8 "script small letter l", which under unicode
+      # COMPAT normalization will turn into ordinary 'l'
+      marc8     = "Conversa\xF0c\xE4ao \xC1"
+      unicode   = "Conversação \u2113"
+      unicode_c   = UNF::Normalizer.normalize(unicode, :nfc)
+      unicode_kc  = UNF::Normalizer.normalize(unicode, :nfkc)
+      unicode_d  = UNF::Normalizer.normalize(unicode, :nfd)
+      unicode_kd  = UNF::Normalizer.normalize(unicode, :nfkd)
+      converter = MARC::Marc8::ToUnicode.new
+      assert_equal unicode_c,  converter.transcode(marc8, :normalization => :nfc)
+      assert_equal unicode_kc, converter.transcode(marc8, :normalization => :nfkc)
+      assert_equal unicode_d,  converter.transcode(marc8, :normalization => :nfd)
+      assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
+      # With :normalization => nil the output is expected to differ from
+      # the NFC form.
+      refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
+    end
+    # Well-formed &#xHHHH; references are expanded by default; malformed
+    # ones are left alone, and :expand_ncr => false disables expansion.
+    def test_expand_ncr
+      converter = MARC::Marc8::ToUnicode.new
+      marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
+      assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
+      assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
+    end
+    # Invalid input raises Encoding::InvalidByteSequenceError by default.
+    def test_bad_byte
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      assert_raise(Encoding::InvalidByteSequenceError) {
+        converter.transcode(bad_marc8)
+      }
+    end
+    # With :invalid => :replace the bad bytes become U+FFFD instead of
+    # raising.
+    def test_bad_byte_with_replacement
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      value = converter.transcode(bad_marc8, :invalid => :replace)
+      assert_equal "UTF-8", value.encoding.name
+      assert value.valid_encoding?
+      assert value.include?("\uFFFD"), "includes replacement char"
+      # coalescing multiple replacement chars at end, could change
+      # to not do so, important thing is at least one is there.
+      assert_equal "米国の統治の仕組�", value
+    end
+    # An explicit empty :replace string drops the bad bytes entirely.
+    def test_bad_byte_with_specified_empty_replacement
+      converter = MARC::Marc8::ToUnicode.new
+      bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+      value = converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
+      assert_equal "UTF-8", value.encoding.name
+      assert value.valid_encoding?
+      assert_equal "米国の統治の仕組", value
+    end
+    # An unrecognized escape sequence is passed through into the output.
+    def test_bad_escape
+      converter = MARC::Marc8::ToUnicode.new
+      # I do not understand what's going on here, or why this is
+      # desired/expected behavior.  But this
+      # test is copied from pymarc , adapted to be straight data not marc record
+      # https://github.com/edsu/pymarc/blob/master/test/marc8.py?source=cc#L34
+      bad_escape_data = "La Soci\xE2et\e,"
+      value = converter.transcode(bad_escape_data)
+      assert_equal "UTF-8", value.encoding.name
+      assert value.valid_encoding?, "Valid encoding"
+      assert_equal "La Soci\u00E9t\x1B,", value
+    end
+  end
+else
+  require 'pathname'
+  $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
+end

data/test/marc_with_bad_utf8.utf8.human ADDED Viewed

@@ -0,0 +1,40 @@
+01161cam a2200289   4500
+001 178448
+008       s1996    xx                  spa d
+035    $a X!b
+049    $a JHWV [AV] [NIRC] $n o
+096    $a WY 20.5 VC6 1996
+110 2  $a National Institutes of Health (U.S.)
+110 2  $a National Institute of Nursing Research (U.S.)#
+(No separator at end of field length=51)
+(Bad indicator data. Skipping 2 bytes)
+110 2  $a Department of Health & Human Services (U.S.)
+(Bad indicator data. Skipping 2 bytes)
+245  0 $a Ten years at NIH : $b advancing health through science : the human dimension / $c Patricia A. Grady, Harold Varmus.
+(Bad indicator data. Skipping 2 bytes)
+246    $a 10 years at NIH
+(Bad indicator data. Skipping 2 bytes)
+300    $a 2 videocassettes (229 min.) : $b sd., col. ; $c 1/2 in.
+(Bad indicator data. Skipping 2 bytes)
+520    $a A series of speakers recounts advances in nursing research from 1986 to 1996.
+(Bad indicator data. Skipping 2 bytes)
+538    $a VHS.
+(Bad indicator data. Skipping 2 bytes)
+650  2 $a Nursing Care $x videocassettes
+(Bad indicator data. Skipping 2 bytes)
+650  2 $a Nursing Research $x videocassettes
+(Bad indicator data. Skipping 2 bytes)
+650  2 $a Nursing $x videocassettes
+(Bad indicator data. Skipping 2 bytes)
+700 10 $a Grady, Patricia Anne, $d 1943-
+(Bad indicator data. Skipping 2 bytes)
+700 1  $a Varmus, Harold
+(Bad indicator data. Skipping 2 bytes)
+910    $a 178448 $b Horizon bib#
+(Bad indicator data. Skipping 2 bytes)
+949 31 $7 1 $5 WY 20.5 VC6 1996 $0 26 $0 G $2 A $8 5 $4 1
+(Bad indicator data. Skipping 2 bytes)
+991    $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199765 $l wempbk $m elsc
+(Bad indicator data. Skipping 2 bytes)
+991    $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199766 $l wempbk $m elsc

data/test/marc_with_bad_utf8.utf8.marc ADDED Viewed

@@ -0,0 +1 @@

+ 01161cam a2200289 4500001000700000008004100007035000800048049002400056096002100080110004100101110005100142110004900193245011600242246002000358300005600378520008500434538000900519650003300528650003700561650002800598700003300626700001900659910002500678949004000703991006400743991006400807178448 s1996 xx spa d aX!b aJHWV [AV] [NIRC]no aWY 20.5 VC6 19962 aNational Institutes of Health (U.S.)2 aNational Institute of Nursing Research (U.S.)#7;2 aDepartment of Health & Human Services (U.S.) 0aTen years at NIH :badvancing health through science : the human dimension /cPatricia A. Grady, Harold Varmus. a10 years at NIH a2 videocassettes (229 min.) :bsd., col. ;c1/2 in. aA series of speakers recounts advances in nursing research from 1986 to 1996. aVHS. 2aNursing Carexvideocassettes 2aNursing Researchxvideocassettes 2aNursingxvideocassettes10aGrady, Patricia Anne,d1943-1 aVarmus, Harold a178448bHorizon bib#31715WY 20.5 VC6 19960260G2A8541 aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199765lwempbkmelsc aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199766lwempbkmelsc

data/test/tc_reader_char_encodings.rb CHANGED Viewed

@@ -44,7 +44,24 @@ if "".respond_to?(:encoding)
       assert_equal(encoding, record['001'].value.encoding.name)
       assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
     end
+    @@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
+    # Asserts that every value in `record` is tagged with `encoding_name`
+    # (default "UTF-8") and is valid in that encoding. MARC::DataField
+    # instances are checked subfield by subfield; any other field is
+    # checked through its own value.
+    def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
+      record.fields.each do |field|
+        if field.kind_of? MARC::DataField
+          field.subfields.each do |sf|
+            assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
+            # Validate the subfield itself rather than the whole field's
+            # value, so a failure points at the subfield named in the message.
+            assert sf.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
+          end
+        else
+          assert_equal encoding_name, field.value.encoding.name, "Is tagged #{encoding_name}: #{field}"
+          assert field.value.valid_encoding?, "Is valid encoding: #{field}"
+        end
+      end
+    end
     ####
     # end helper methods
     ####
@@ -100,13 +117,57 @@ if "".respond_to?(:encoding)
     end
     def test_marc8_with_binary
-      # Marc8, best we can do is read it in binary.
+      # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
       reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
       record = reader.first
       assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
     end
+    # Reading the fixture with :external_encoding => 'MARC-8' should give
+    # values that pass assert_all_values_valid_encoding, with the accented
+    # name in 100$a matching the expected string.
+    def test_marc8_converted_to_unicode
+      record = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'MARC-8').first
+      assert_all_values_valid_encoding(record)
+      assert_equal "Serreau, Geneviève.", record['100']['a']
+    end
+    # Same conversion as the path-based test, but the reader is handed an
+    # already-open File object instead of a filename.
+    def test_marc8_converted_to_unicode_with_file_handle
+      # had some trouble with this one, let's ensure it with a test
+      # Block form of File.open so the handle is closed when the test ends,
+      # including when an assertion fails.
+      File.open('test/marc8_accented_chars.marc') do |file|
+        reader = MARC::Reader.new(file, :external_encoding => "MARC-8")
+        record = reader.first
+        assert_all_values_valid_encoding(record)
+      end
+    end
+    # The fixture's 260$a is expected to come back with a literal U+FFFD
+    # character in place of its escaped character reference.
+    def test_marc8_with_char_entity
+      record = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :external_encoding => "MARC-8").first
+      assert_all_values_valid_encoding(record)
+      assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
+    end
+    # Without an :invalid option, reading the first record of the bad
+    # MARC-8 fixture is expected to raise Encoding::InvalidByteSequenceError.
+    def test_bad_marc8_raises
+      assert_raise(Encoding::InvalidByteSequenceError) do
+        MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8').first
+      end
+    end
+    # With :invalid => :replace and a custom :replace string, the bad
+    # MARC-8 fixture reads cleanly and 880$a contains that string.
+    def test_bad_marc8_with_replacement
+      record = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8', :invalid => :replace, :replace => "[?]").first
+      assert_all_values_valid_encoding(record)
+      assert record['880']['a'].include?("[?]"), "includes specified replacement string"
+    end
     def test_load_file_opened_with_external_encoding
       reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
@@ -125,12 +186,38 @@ if "".respond_to?(:encoding)
     end
     def test_from_string_with_utf8_encoding
-      marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8")
+      marc_file = File.open(@@utf_marc_path)
-      reader = MARC::Reader.new(StringIO.new(marc_string))
+      reader = MARC::Reader.new(marc_file)
       record = reader.first
-      assert_utf8_right_in_utf8(record)
+    end
+    # Something that was failing in my client Blacklight code,
+    # bad bytes should be handled appropriately
+    #
+    # Reads a UTF-8 fixture containing a bad byte with :invalid => :replace
+    # and expects every value to be valid UTF-8, with U+FFFD in 520$a.
+    def test_from_string_utf8_with_bad_byte
+      # Block form of File.open so the fixture handle is closed when the
+      # test ends, including when an assertion fails.
+      File.open('test/marc_with_bad_utf8.utf8.marc') do |marc_file|
+        record = MARC::Reader.new(marc_file, :invalid => :replace).first
+        # Shared helper performs the per-field / per-subfield UTF-8 checks
+        # that were previously written out by hand here.
+        assert_all_values_valid_encoding(record)
+        assert record['520']['a'].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
+      end
     end
     def test_from_string_with_cp866

metadata CHANGED Viewed

@@ -1,32 +1,66 @@
 --- !ruby/object:Gem::Specification
 name: marc
 version: !ruby/object:Gem::Version
-  prerelease:
-  version: 0.7.1
+  version: 0.8.0
+  prerelease:
 platform: ruby
 authors:
 - Kevin Clarke
 - Bill Dueber
 - William Groppe
+- Jonathan Rochkind
 - Ross Singer
 - Ed Summers
 autorequire: marc
 bindir: bin
 cert_chain: []
-date: 2013-09-09 00:00:00.000000000 Z
-dependencies: []
-description:
+date: 2013-11-20 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: ensure_valid_encoding
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: unf
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description:
 email: ehs@pobox.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
-- lib/marc.rb
 - lib/marc/constants.rb
 - lib/marc/controlfield.rb
 - lib/marc/datafield.rb
 - lib/marc/dublincore.rb
 - lib/marc/exception.rb
+- lib/marc/marc8/map_to_unicode.rb
+- lib/marc/marc8/to_unicode.rb
 - lib/marc/reader.rb
 - lib/marc/record.rb
 - lib/marc/subfield.rb
@@ -36,12 +70,24 @@ files:
 - lib/marc/xml_parsers.rb
 - lib/marc/xmlreader.rb
 - lib/marc/xmlwriter.rb
+- lib/marc.rb
+- test/bad_eacc_encoding.marc8.marc
 - test/batch.dat
 - test/batch.xml
+- test/bib178448.okay.human
+- test/bib178448.okay.marc
+- test/bib178448.writtenout.marc
 - test/cp866_multirecord.marc
 - test/cp866_unimarc.marc
+- test/escaped_character_reference.marc8.marc
 - test/hebrew880s.marc
+- test/marc8/data/test_marc8.txt
+- test/marc8/data/test_utf8.txt
+- test/marc8/tc_marc8_mapping.rb
+- test/marc8/tc_to_unicode.rb
 - test/marc8_accented_chars.marc
+- test/marc_with_bad_utf8.utf8.human
+- test/marc_with_bad_utf8.utf8.marc
 - test/no-leading-zero.xml
 - test/non-numeric.dat
 - test/non-numeric.xml
@@ -59,7 +105,6 @@ files:
 - test/tc_reader_char_encodings.rb
 - test/tc_record.rb
 - test/tc_subfield.rb
-- test/tc_weird_jruby_bytes.rb
 - test/tc_writer.rb
 - test/tc_xml.rb
 - test/ts_marc.rb
@@ -73,27 +118,28 @@ files:
 homepage: https://github.com/ruby-marc/ruby-marc/
 licenses:
 - MIT
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: 1.8.6
-  none: false
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
-  none: false
 requirements: []
-rubyforge_project:
-rubygems_version: 1.8.24
-signing_key:
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
 specification_version: 3
 summary: A ruby library for working with Machine Readable Cataloging
 test_files:
 - test/ts_marc.rb
+has_rdoc: true