RubyGems - marc - Versions diffs - 1.0.4 → 1.2.0 - Mend

marc 1.0.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
data/.github/workflows/ruby.yml +24 -0
data/.gitignore +17 -0
data/.standard.yml +1 -0
data/{Changes → CHANGELOG.md} +106 -29
data/Gemfile +15 -0
data/README.md +240 -47
data/Rakefile +14 -14
data/bin/marc +14 -0
data/bin/marc2xml +17 -0
data/examples/xml2marc.rb +10 -0
data/lib/marc/constants.rb +3 -3
data/lib/marc/controlfield.rb +35 -23
data/lib/marc/datafield.rb +70 -63
data/lib/marc/dublincore.rb +59 -41
data/lib/marc/exception.rb +9 -1
data/lib/marc/jsonl_reader.rb +33 -0
data/lib/marc/jsonl_writer.rb +44 -0
data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
data/lib/marc/marc8/to_unicode.rb +80 -86
data/lib/marc/reader.rb +119 -121
data/lib/marc/record.rb +72 -62
data/lib/marc/subfield.rb +12 -10
data/lib/marc/unsafe_xmlwriter.rb +93 -0
data/lib/marc/version.rb +1 -1
data/lib/marc/writer.rb +27 -30
data/lib/marc/xml_parsers.rb +222 -197
data/lib/marc/xmlreader.rb +131 -114
data/lib/marc/xmlwriter.rb +93 -81
data/lib/marc.rb +20 -18
data/marc.gemspec +23 -0
data/test/marc8/tc_marc8_mapping.rb +3 -3
data/test/marc8/tc_to_unicode.rb +28 -32
data/test/messed_up_leader.xml +9 -0
data/test/tc_controlfield.rb +37 -34
data/test/tc_datafield.rb +65 -60
data/test/tc_dublincore.rb +9 -11
data/test/tc_hash.rb +10 -13
data/test/tc_jsonl.rb +19 -0
data/test/tc_marchash.rb +17 -21
data/test/tc_parsers.rb +108 -144
data/test/tc_reader.rb +35 -36
data/test/tc_reader_char_encodings.rb +149 -169
data/test/tc_record.rb +143 -148
data/test/tc_subfield.rb +14 -13
data/test/tc_unsafe_xml.rb +95 -0
data/test/tc_writer.rb +101 -108
data/test/tc_xml.rb +99 -87
data/test/tc_xml_error_handling.rb +7 -8
data/test/ts_marc.rb +8 -8
metadata +94 -9

data/lib/marc/xmlreader.rb CHANGED Viewed

@@ -1,16 +1,15 @@
-require File.dirname(__FILE__) + '/xml_parsers'
+require File.dirname(__FILE__) + "/xml_parsers"
 module MARC
   # the constructor which you can pass either a filename:
   #
   #   reader = MARC::XMLReader.new('/Users/edsu/marc.xml')
   #
-  # or a File object,
+  # or a File object,
   #
   #   reader = MARC::XMLReader.new(File.new('/Users/edsu/marc.xml'))
   #
   # or really any object that responds to read(n)
-  #
+  #
   #   reader = MARC::XMLReader.new(StringIO.new(xml))
   #
   # By default, XMLReader uses REXML's pull parser, but you can swap
@@ -18,7 +17,7 @@ module MARC
   # 'best' one).  The :parser can either be one of the defined constants
   # or the constant's value.
   #
-  #   reader = MARC::XMLReader.new(fh, :parser=>'magic')
+  #   reader = MARC::XMLReader.new(fh, :parser=>'magic')
   #
   # It is also possible to set the default parser at the class level so
   # all subsequent instances will use it instead:
@@ -28,151 +27,169 @@ module MARC
   #
   # Use:
   #   MARC::XMLReader.best_available!
-  #
+  #
   # or
   #   MARC::XMLReader.nokogiri!
-  #
+  #
+  # By default, all XML parsers except REXML require the MARC namespace
+  # (http://www.loc.gov/MARC21/slim) to be included. Adding the option
+  # `ignore_namespace` to the call to `new` with a true value
+  # will allow parsing to proceed,  e.g.,
+  #
+  #     reader = MARC::XMLReader.new(filename, parser: :nokogiri, ignore_namespace: true)
+  #
+  # You can also pass in an error_handler option that will be called if
+  # there are any validation errors found when parsing a record.
+  #
+  #  reader = MARC::XMLReader.new(fh, error_handler: ->(reader, record, block) { ... })
+  #
+  # By default, a MARC::RecordException is raised halting all future parsing.
   class XMLReader
     include Enumerable
-    USE_BEST_AVAILABLE = 'magic'
-    USE_REXML = 'rexml'
-    USE_NOKOGIRI = 'nokogiri'
-    USE_JREXML = 'jrexml'
-    USE_JSTAX = 'jstax'
-    USE_LIBXML = 'libxml'
+    USE_BEST_AVAILABLE = "magic"
+    USE_REXML = "rexml"
+    USE_NOKOGIRI = "nokogiri"
+    USE_JREXML = "jrexml"
+    USE_JSTAX = "jstax"
+    USE_LIBXML = "libxml"
     @@parser = USE_REXML
-    attr_reader :parser
+    attr_reader :parser, :error_handler
     def initialize(file, options = {})
       if file.is_a?(String)
         handle = File.new(file)
-      elsif file.respond_to?("read", 5)
+      elsif file.respond_to?(:read, 5)
         handle = file
       else
         raise ArgumentError, "must pass in path or File"
       end
       @handle = handle
-      if options[:parser]
-        parser = self.class.choose_parser(options[:parser].to_s)
+      if options[:ignore_namespace]
+        @ignore_namespace = options[:ignore_namespace]
+      end
+      parser = if options[:parser]
+        self.class.choose_parser(options[:parser].to_s)
       else
-        parser = @@parser
+        @@parser
       end
       case parser
-      when 'magic' then extend MagicReader
-      when 'rexml' then extend REXMLReader
-      when 'jrexml' then
+      when "magic" then extend MagicReader
+      when "rexml" then extend REXMLReader
+      when "jrexml"
         raise ArgumentError, "jrexml only available under jruby" unless defined? JRUBY_VERSION
         extend JREXMLReader
-      when 'nokogiri' then extend NokogiriReader
-      when 'jstax' then
+      when "nokogiri" then extend NokogiriReader
+      when "jstax"
         raise ArgumentError, "jstax only available under jruby" unless defined? JRUBY_VERSION
         extend JRubySTAXReader
-      when 'libxml' then extend LibXMLReader
-        raise ArgumentError, "libxml not available under jruby" if defined? JRUBY_VERSION
+      when "libxml" then extend LibXMLReader
+                         raise ArgumentError, "libxml not available under jruby" if defined? JRUBY_VERSION
       end
-    end
-    # Returns the currently set parser type
-    def self.parser
-      return @@parser
-    end
-    # Returns an array of all the parsers available
-    def self.parsers
-      p = []
-      self.constants.each do | const |
-        next unless const.match("^USE_")
-        p << const
-      end
-      return p
-    end
-    # Sets the class parser
-    def self.parser=(p)
-      @@parser = choose_parser(p)
+      @error_handler = options[:error_handler]
     end
-    # Returns the value of the best available parser
-    def self.best_available
-      parser = nil
-      jruby = [USE_NOKOGIRI, USE_JSTAX, USE_JREXML]
-      ruby = [USE_NOKOGIRI, USE_LIBXML]
-      if defined? JRUBY_VERSION
-        unless parser
-          begin
-            require 'nokogiri'
-            parser = USE_NOKOGIRI
-          rescue LoadError
-          end
+    class << self
+      # Returns the currently set parser type
+      def parser
+        @@parser
+      end
+      # Returns an array of all the parsers available
+      def parsers
+        p = []
+        constants.each do |const|
+          next unless const.match?("^USE_")
+          p << const
         end
-        unless parser
-          begin
-            # try to find the class, so we throw an error if not found
-            java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
-            parser = USE_JSTAX
-          rescue java.lang.ClassNotFoundException
+        p
+      end
+      # Sets the class parser
+      def parser=(p)
+        @@parser = choose_parser(p)
+      end
+      # Returns the value of the best available parser
+      def best_available
+        parser = nil
+        if defined? JRUBY_VERSION
+          unless parser
+            begin
+              require "nokogiri"
+              parser = USE_NOKOGIRI
+            rescue LoadError
+            end
           end
-        end
-        unless parser
-          begin
-            require 'jrexml'
-            parser = USE_JREXML
-          rescue LoadError
+          unless parser
+            begin
+              # try to find the class, so we throw an error if not found
+              java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
+              parser = USE_JSTAX
+            rescue java.lang.ClassNotFoundException
+            end
           end
-        end
-      else
-        begin
-          require 'nokogiri'
-          parser = USE_NOKOGIRI
-        rescue LoadError
-        end
-        unless defined? JRUBY_VERSION
           unless parser
             begin
-              require 'xml'
-              parser = USE_LIBXML
+              require "jrexml"
+              parser = USE_JREXML
             rescue LoadError
             end
-          end
+          end
+        else
+          begin
+            require "nokogiri"
+            parser = USE_NOKOGIRI
+          rescue LoadError
+          end
+          unless defined? JRUBY_VERSION
+            unless parser
+              begin
+                require "xml"
+                parser = USE_LIBXML
+              rescue LoadError
+              end
+            end
+          end
         end
+        parser ||= USE_REXML
+        parser
       end
-      parser = USE_REXML unless parser
-      parser
-    end
-    # Sets the best available parser as the default
-    def self.best_available!
-      @@parser = self.best_available
-    end
-    # Sets Nokogiri as the default parser
-    def self.nokogiri!
-      @@parser = USE_NOKOGIRI
-    end
-    # Sets jrexml as the default parser
-    def self.jrexml!
-      @@parser = USE_JREXML
-    end
-    # Sets REXML as the default parser
-    def self.rexml!
-      @@parser = USE_REXML
-    end
-    protected
-    def self.choose_parser(p)
-      match = false
-      self.constants.each do | const |
-        next unless const.to_s.match("^USE_")
-        if self.const_get(const) == p
-          match = true
-          return p
+      # Sets the best available parser as the default
+      def best_available!
+        @@parser = best_available
+      end
+      # Sets Nokogiri as the default parser
+      def nokogiri!
+        @@parser = USE_NOKOGIRI
+      end
+      # Sets jrexml as the default parser
+      def jrexml!
+        @@parser = USE_JREXML
+      end
+      # Sets REXML as the default parser
+      def rexml!
+        @@parser = USE_REXML
+      end
+      def choose_parser(p)
+        match = false
+        constants.each do |const|
+          next unless const.to_s.match?("^USE_")
+          if const_get(const) == p
+            match = true
+            return p
+          end
         end
+        raise ArgumentError.new("Parser '#{p}' not defined") unless match
       end
-      raise ArgumentError.new("Parser '#{p}' not defined") unless match
     end
   end
 end

data/lib/marc/xmlwriter.rb CHANGED Viewed

@@ -1,155 +1,167 @@
-require 'rexml/document'
-require 'rexml/text'
-require 'rexml/formatters/default'
+require "rexml/document"
+require "rexml/text"
+require "rexml/formatters/default"
 module MARC
   # A class for writing MARC records as MARCXML.
   # BIG CAVEAT! XMLWriter will *not* convert your MARC8 to UTF8
   # bug the authors to do this if you need it
   class XMLWriter
     # the constructor which you must pass a file path
     # or an object that responds to a write message
     # the second argument is a hash of options, currently
     # only supporting one option, stylesheet
-    #
+    #
     # writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
     # writer.write record
-    def initialize(file, opts={})
+    #
+    COLLECTION_TAG = %(<collection xmlns='#{MARC_NS}'
+      xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
+      xsi:schemaLocation="#{MARC_NS} #{MARC_XSD}">).freeze
+    def initialize(file, opts = {}, &blk)
       @writer = REXML::Formatters::Default.new
-      if file.class == String
-        @fh = File.new(file,"w")
-      elsif file.respond_to?('write')
+      if file.instance_of?(String)
+        @fh = File.new(file, "w")
+      elsif file.respond_to?(:write)
         @fh = file
       else
         raise ArgumentError, "must pass in file name or handle"
       end
+      @stylesheet = opts[:stylesheet]
       @fh.write("<?xml version='1.0'?>\n")
-      if opts[:stylesheet]
-        @fh.write(
-          %Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>\n})
-      end
-      @fh.write("<collection xmlns='" + MARC_NS + "' " +
-        "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
-        "xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
+      @fh.write(stylesheet_tag)
+      @fh.write(COLLECTION_TAG)
       @fh.write("\n")
+      if block_given?
+        blk.call(self)
+        self.close
+      end
+    end
+    def stylesheet_tag
+      if @stylesheet
+        %(<?xml-stylesheet type="text/xsl" href="#{@stylesheet}"?>\n)
+      else
+        ""
+      end
     end
     # write a record to the file or handle
     def write(record)
       @writer.write(MARC::XMLWriter.encode(record), @fh)
       @fh.write("\n")
     end
     # close underlying filehandle
     def close
       @fh.write("</collection>")
       @fh.close
     end
+    def self.fix_leader(leader)
+      fixed_leader = leader.gsub(/[^\w|^\s]/, "Z")
+      # The leader must have at least 24 characters
+      fixed_leader = fixed_leader.ljust(24) if fixed_leader.length < 24
+      # MARCXML is particular about last four characters; ILSes aren't
+      if fixed_leader[20..23] != "4500"
+        fixed_leader[20..23] = "4500"
+      end
+      # MARCXML doesn't like a space here so we need a filler character: Z
+      if fixed_leader[6..6] == " "
+        fixed_leader[6..6] = "Z"
+      end
+      fixed_leader
+    end
     # a static method that accepts a MARC::Record object
     # and returns a REXML::Element for the XML serialization.
+    def self.encode(record, opts = {})
+      single_char = Regexp.new('[\da-z ]{1}')
+      subfield_char = Regexp.new('[\dA-Za-z!"#$%&\'()*+,-./:;<=>?{}_^`~\[\]\\\]{1}')
+      control_field_tag = Regexp.new("00[1-9A-Za-z]{1}")
-    def self.encode(record, opts={})
-      singleChar = Regexp.new('[\da-z ]{1}')
-      ctrlFieldTag = Regexp.new('00[1-9A-Za-z]{1}')
       # Right now, this writer handles input from the strict and
       # lenient MARC readers. Because it can get 'loose' MARC in, it
       # attempts to do some cleanup on data values that are not valid
       # MARCXML.
       # TODO? Perhaps the 'loose MARC' checks should be split out
       # into a tolerant MARCXMLWriter allowing the main one to skip
       # this extra work.
       # TODO: At the very least there should be some logging
       # to record our attempts to account for less than perfect MARC.
-      e = REXML::Element.new('record')
+      e = REXML::Element.new("record")
       e.add_namespace(MARC_NS) if opts[:include_namespace]
-      # MARCXML only allows alphanumerics or spaces in the leader
-      record.leader.gsub!(/[^\w|^\s]/, 'Z')
-      # MARCXML is particular about last four characters; ILSes aren't
-      if (record.leader[20..23] != "4500")
-        record.leader[20..23] = "4500"
-      end
+      leader_element = REXML::Element.new("leader")
+      leader_element.add_text(fix_leader(record.leader))
+      e.add_element(leader_element)
-      # MARCXML doesn't like a space here so we need a filler character: Z
-      if (record.leader[6..6] == " ")
-        record.leader[6..6] = "Z"
-      end
-      leader = REXML::Element.new("leader")
-      leader.add_text(record.leader)
-      e.add_element(leader)
       record.each do |field|
-        if field.class == MARC::DataField
+        if field.instance_of?(MARC::DataField)
           datafield_elem = REXML::Element.new("datafield")
+          ind1 = field.indicator1
           # If marc is leniently parsed, we may have some dirty data; using
           # the 'z' ind1 value should help us locate these later to fix
-          if field.indicator1.nil? || (field.indicator1.match(singleChar) == nil)
-            field.indicator1 = 'z'
-          end
+          ind1 = "z" if ind1.nil? || !ind1.match?(single_char)
+          ind2 = field.indicator2
           # If marc is leniently parsed, we may have some dirty data; using
           # the 'z' ind2 value should help us locate these later to fix
-          if field.indicator2.nil? || (field.indicator2.match(singleChar) == nil)
-            field.indicator2 = 'z'
-          end
+          ind2 = "z" if field.indicator2.nil? || !ind2.match?(single_char)
           datafield_elem.add_attributes({
-            "tag"=>field.tag,
-            "ind1"=>field.indicator1,
-            "ind2"=>field.indicator2
+            "tag" => field.tag,
+            "ind1" => ind1,
+            "ind2" => ind2
           })
-          for subfield in field.subfields
+          field.subfields.each do |subfield|
             subfield_element = REXML::Element.new("subfield")
+            code = subfield.code
             # If marc is leniently parsed, we may have some dirty data; using
             # the blank subfield code should help us locate these later to fix
-            if (subfield.code.match(singleChar) == nil)
-              subfield.code = ' '
-            end
-            subfield_element.add_attribute("code", subfield.code)
+            code = " " if subfield.code.match(subfield_char).nil?
+            subfield_element.add_attribute("code", code)
             text = subfield.value
             subfield_element.add_text(text)
             datafield_elem.add_element(subfield_element)
           end
           e.add_element datafield_elem
-        elsif field.class == MARC::ControlField
+        elsif field.instance_of?(MARC::ControlField)
           control_element = REXML::Element.new("controlfield")
+          tag = field.tag
           # We need a marker for invalid tag values (we use 00z)
-          unless field.tag.match(ctrlFieldTag) or MARC::ControlField.control_tag?(ctrlFieldTag)
-            field.tag = "00z"
-          end
-          control_element.add_attribute("tag", field.tag)
+          tag = "00z" unless tag.match(control_field_tag) || MARC::ControlField.control_tag?(tag)
+          control_element.add_attribute("tag", tag)
           text = field.value
           control_element.add_text(text)
           e.add_element(control_element)
         end
       end
       # return xml
-      return e
+      e
     end
   end
 end

data/lib/marc.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-#marc is a ruby library for reading and writing MAchine Readable Cataloging
-#(MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
+# marc is a ruby library for reading and writing MAchine Readable Cataloging
+# (MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
 #
-#USAGE
+# USAGE
 #
 #    require 'marc'
 #
@@ -11,7 +11,7 @@
 #      puts record['245']['a']
 #    end
 #
-#    # creating a record
+#    # creating a record
 #    record = MARC::Record.new()
 #    record.add_field(MARC::DataField.new('100', '0',  ' ', ['a', 'John Doe']))
 #
@@ -30,17 +30,19 @@
 #    record = MARC::Record.new()
 #    record.add_field(MARC::ControlField.new('FMT', 'Book')) # doesn't raise an error
-require File.dirname(__FILE__) + '/marc/version'
-require File.dirname(__FILE__) + '/marc/constants'
-require File.dirname(__FILE__) + '/marc/record'
-require File.dirname(__FILE__) + '/marc/datafield'
-require File.dirname(__FILE__) + '/marc/controlfield'
-require File.dirname(__FILE__) + '/marc/subfield'
-require File.dirname(__FILE__) + '/marc/reader'
-require File.dirname(__FILE__) + '/marc/writer'
-require File.dirname(__FILE__) + '/marc/exception'
-require File.dirname(__FILE__) + '/marc/xmlwriter'
-require File.dirname(__FILE__) + '/marc/xmlreader'
-require File.dirname(__FILE__) + '/marc/dublincore'
-require File.dirname(__FILE__) + '/marc/xml_parsers'
+require_relative "marc/version"
+require_relative "marc/constants"
+require_relative "marc/record"
+require_relative "marc/datafield"
+require_relative "marc/controlfield"
+require_relative "marc/subfield"
+require_relative "marc/reader"
+require_relative "marc/writer"
+require_relative "marc/exception"
+require_relative "marc/xmlwriter"
+require_relative "marc/unsafe_xmlwriter"
+require_relative "marc/xmlreader"
+require_relative "marc/dublincore"
+require_relative "marc/xml_parsers"
+require_relative "marc/jsonl_reader"
+require_relative "marc/jsonl_writer"

data/marc.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+require File.join(File.dirname(__FILE__), "lib/marc/version")
+Gem::Specification.new do |s|
+  s.name = "marc"
+  s.version = MARC::VERSION
+  s.author = "Ed Summers"
+  s.email = "ehs@pobox.com"
+  s.homepage = "https://github.com/ruby-marc/ruby-marc/"
+  s.summary = "A ruby library for working with Machine Readable Cataloging"
+  s.license = "MIT"
+  s.required_ruby_version = ">= 1.8.6"
+  s.authors = ["Kevin Clarke", "Bill Dueber", "William Groppe", "Jonathan Rochkind", "Ross Singer", "Ed Summers", "Chris Beer"]
+  s.files = `git ls-files -z`.split("\x0")
+  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]
+  s.add_development_dependency "standard", "~>1.0"
+  s.add_dependency "scrub_rb", ">= 1.0.1", "< 2" # backport for ruby 2.1 String#scrub
+  s.add_dependency "unf" # unicode normalization
+  s.add_dependency "rexml" # rexml was unbundled from the stdlib in ruby 3
+end

data/test/marc8/tc_marc8_mapping.rb CHANGED Viewed

@@ -1,6 +1,6 @@
-require 'test/unit'
-require 'marc'
-require 'marc/marc8/map_to_unicode'
+require "test/unit"
+require "marc"
+require "marc/marc8/map_to_unicode"
 class TestMarc8Mapping < Test::Unit::TestCase
   def test_codesets_just_exist