RubyGems - marc - Versions diffs - 1.1.1 → 1.2.0 - Mend

marc 1.1.1 → 1.2.0

Files changed (52) hide show

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
data/.github/workflows/ruby.yml +24 -0
data/.gitignore +17 -0
data/.standard.yml +1 -0
data/{Changes → CHANGELOG.md} +102 -30
data/Gemfile +15 -0
data/README.md +239 -46
data/Rakefile +14 -14
data/bin/marc +14 -0
data/bin/marc2xml +17 -0
data/examples/xml2marc.rb +10 -0
data/lib/marc/constants.rb +3 -3
data/lib/marc/controlfield.rb +35 -23
data/lib/marc/datafield.rb +70 -63
data/lib/marc/dublincore.rb +59 -41
data/lib/marc/exception.rb +9 -1
data/lib/marc/jsonl_reader.rb +33 -0
data/lib/marc/jsonl_writer.rb +44 -0
data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
data/lib/marc/marc8/to_unicode.rb +80 -86
data/lib/marc/reader.rb +117 -123
data/lib/marc/record.rb +72 -62
data/lib/marc/subfield.rb +12 -10
data/lib/marc/unsafe_xmlwriter.rb +93 -0
data/lib/marc/version.rb +1 -1
data/lib/marc/writer.rb +27 -30
data/lib/marc/xml_parsers.rb +222 -197
data/lib/marc/xmlreader.rb +131 -114
data/lib/marc/xmlwriter.rb +93 -82
data/lib/marc.rb +20 -18
data/marc.gemspec +23 -0
data/test/marc8/tc_marc8_mapping.rb +3 -3
data/test/marc8/tc_to_unicode.rb +28 -32
data/test/messed_up_leader.xml +9 -0
data/test/tc_controlfield.rb +37 -34
data/test/tc_datafield.rb +65 -60
data/test/tc_dublincore.rb +9 -11
data/test/tc_hash.rb +10 -13
data/test/tc_jsonl.rb +19 -0
data/test/tc_marchash.rb +17 -21
data/test/tc_parsers.rb +108 -144
data/test/tc_reader.rb +35 -36
data/test/tc_reader_char_encodings.rb +149 -169
data/test/tc_record.rb +143 -148
data/test/tc_subfield.rb +14 -13
data/test/tc_unsafe_xml.rb +95 -0
data/test/tc_writer.rb +101 -108
data/test/tc_xml.rb +101 -94
data/test/tc_xml_error_handling.rb +7 -8
data/test/ts_marc.rb +8 -8
metadata +80 -9

data/lib/marc/xmlreader.rb CHANGED Viewed

@@ -1,16 +1,15 @@
-require File.dirname(__FILE__) + '/xml_parsers'
+require File.dirname(__FILE__) + "/xml_parsers"
 module MARC
   # the constructor which you can pass either a filename:
   #
   #   reader = MARC::XMLReader.new('/Users/edsu/marc.xml')
   #
-  # or a File object,
+  # or a File object,
   #
   #   reader = MARC::XMLReader.new(File.new('/Users/edsu/marc.xml'))
   #
   # or really any object that responds to read(n)
-  #
+  #
   #   reader = MARC::XMLReader.new(StringIO.new(xml))
   #
   # By default, XMLReader uses REXML's pull parser, but you can swap
@@ -18,7 +17,7 @@ module MARC
   # 'best' one).  The :parser can either be one of the defined constants
   # or the constant's value.
   #
-  #   reader = MARC::XMLReader.new(fh, :parser=>'magic')
+  #   reader = MARC::XMLReader.new(fh, :parser=>'magic')
   #
   # It is also possible to set the default parser at the class level so
   # all subsequent instances will use it instead:
@@ -28,151 +27,169 @@ module MARC
   #
   # Use:
   #   MARC::XMLReader.best_available!
-  #
+  #
   # or
   #   MARC::XMLReader.nokogiri!
-  #
+  #
+  # By default, all XML parsers except REXML require the MARC namespace
+  # (http://www.loc.gov/MARC21/slim) to be included. Adding the option
+  # `ignore_namespace` to the call to `new` with a true value
+  # will allow parsing to proceed,  e.g.,
+  #
+  #     reader = MARC::XMLReader.new(filename, parser: :nokogiri, ignore_namespace: true)
+  #
+  # You can also pass in an error_handler option that will be called if
+  # there are any validation errors found when parsing a record.
+  #
+  #  reader = MARC::XMLReader.new(fh, error_handler: ->(reader, record, block) { ... })
+  #
+  # By default, a MARC::RecordException is raised halting all future parsing.
   class XMLReader
     include Enumerable
-    USE_BEST_AVAILABLE = 'magic'
-    USE_REXML = 'rexml'
-    USE_NOKOGIRI = 'nokogiri'
-    USE_JREXML = 'jrexml'
-    USE_JSTAX = 'jstax'
-    USE_LIBXML = 'libxml'
+    USE_BEST_AVAILABLE = "magic"
+    USE_REXML = "rexml"
+    USE_NOKOGIRI = "nokogiri"
+    USE_JREXML = "jrexml"
+    USE_JSTAX = "jstax"
+    USE_LIBXML = "libxml"
     @@parser = USE_REXML
-    attr_reader :parser
+    attr_reader :parser, :error_handler
     def initialize(file, options = {})
       if file.is_a?(String)
         handle = File.new(file)
-      elsif file.respond_to?("read", 5)
+      elsif file.respond_to?(:read, 5)
         handle = file
       else
         raise ArgumentError, "must pass in path or File"
       end
       @handle = handle
-      if options[:parser]
-        parser = self.class.choose_parser(options[:parser].to_s)
+      if options[:ignore_namespace]
+        @ignore_namespace = options[:ignore_namespace]
+      end
+      parser = if options[:parser]
+        self.class.choose_parser(options[:parser].to_s)
       else
-        parser = @@parser
+        @@parser
       end
       case parser
-      when 'magic' then extend MagicReader
-      when 'rexml' then extend REXMLReader
-      when 'jrexml' then
+      when "magic" then extend MagicReader
+      when "rexml" then extend REXMLReader
+      when "jrexml"
         raise ArgumentError, "jrexml only available under jruby" unless defined? JRUBY_VERSION
         extend JREXMLReader
-      when 'nokogiri' then extend NokogiriReader
-      when 'jstax' then
+      when "nokogiri" then extend NokogiriReader
+      when "jstax"
         raise ArgumentError, "jstax only available under jruby" unless defined? JRUBY_VERSION
         extend JRubySTAXReader
-      when 'libxml' then extend LibXMLReader
-        raise ArgumentError, "libxml not available under jruby" if defined? JRUBY_VERSION
+      when "libxml" then extend LibXMLReader
+                         raise ArgumentError, "libxml not available under jruby" if defined? JRUBY_VERSION
       end
-    end
-    # Returns the currently set parser type
-    def self.parser
-      return @@parser
-    end
-    # Returns an array of all the parsers available
-    def self.parsers
-      p = []
-      self.constants.each do | const |
-        next unless const.match("^USE_")
-        p << const
-      end
-      return p
-    end
-    # Sets the class parser
-    def self.parser=(p)
-      @@parser = choose_parser(p)
+      @error_handler = options[:error_handler]
     end
-    # Returns the value of the best available parser
-    def self.best_available
-      parser = nil
-      jruby = [USE_NOKOGIRI, USE_JSTAX, USE_JREXML]
-      ruby = [USE_NOKOGIRI, USE_LIBXML]
-      if defined? JRUBY_VERSION
-        unless parser
-          begin
-            require 'nokogiri'
-            parser = USE_NOKOGIRI
-          rescue LoadError
-          end
+    class << self
+      # Returns the currently set parser type
+      def parser
+        @@parser
+      end
+      # Returns an array of all the parsers available
+      def parsers
+        p = []
+        constants.each do |const|
+          next unless const.match?("^USE_")
+          p << const
         end
-        unless parser
-          begin
-            # try to find the class, so we throw an error if not found
-            java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
-            parser = USE_JSTAX
-          rescue java.lang.ClassNotFoundException
+        p
+      end
+      # Sets the class parser
+      def parser=(p)
+        @@parser = choose_parser(p)
+      end
+      # Returns the value of the best available parser
+      def best_available
+        parser = nil
+        if defined? JRUBY_VERSION
+          unless parser
+            begin
+              require "nokogiri"
+              parser = USE_NOKOGIRI
+            rescue LoadError
+            end
           end
-        end
-        unless parser
-          begin
-            require 'jrexml'
-            parser = USE_JREXML
-          rescue LoadError
+          unless parser
+            begin
+              # try to find the class, so we throw an error if not found
+              java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
+              parser = USE_JSTAX
+            rescue java.lang.ClassNotFoundException
+            end
           end
-        end
-      else
-        begin
-          require 'nokogiri'
-          parser = USE_NOKOGIRI
-        rescue LoadError
-        end
-        unless defined? JRUBY_VERSION
           unless parser
             begin
-              require 'xml'
-              parser = USE_LIBXML
+              require "jrexml"
+              parser = USE_JREXML
             rescue LoadError
             end
-          end
+          end
+        else
+          begin
+            require "nokogiri"
+            parser = USE_NOKOGIRI
+          rescue LoadError
+          end
+          unless defined? JRUBY_VERSION
+            unless parser
+              begin
+                require "xml"
+                parser = USE_LIBXML
+              rescue LoadError
+              end
+            end
+          end
         end
+        parser ||= USE_REXML
+        parser
       end
-      parser = USE_REXML unless parser
-      parser
-    end
-    # Sets the best available parser as the default
-    def self.best_available!
-      @@parser = self.best_available
-    end
-    # Sets Nokogiri as the default parser
-    def self.nokogiri!
-      @@parser = USE_NOKOGIRI
-    end
-    # Sets jrexml as the default parser
-    def self.jrexml!
-      @@parser = USE_JREXML
-    end
-    # Sets REXML as the default parser
-    def self.rexml!
-      @@parser = USE_REXML
-    end
-    protected
-    def self.choose_parser(p)
-      match = false
-      self.constants.each do | const |
-        next unless const.to_s.match("^USE_")
-        if self.const_get(const) == p
-          match = true
-          return p
+      # Sets the best available parser as the default
+      def best_available!
+        @@parser = best_available
+      end
+      # Sets Nokogiri as the default parser
+      def nokogiri!
+        @@parser = USE_NOKOGIRI
+      end
+      # Sets jrexml as the default parser
+      def jrexml!
+        @@parser = USE_JREXML
+      end
+      # Sets REXML as the default parser
+      def rexml!
+        @@parser = USE_REXML
+      end
+      def choose_parser(p)
+        match = false
+        constants.each do |const|
+          next unless const.to_s.match?("^USE_")
+          if const_get(const) == p
+            match = true
+            return p
+          end
         end
+        raise ArgumentError.new("Parser '#{p}' not defined") unless match
       end
-      raise ArgumentError.new("Parser '#{p}' not defined") unless match
     end
   end
 end

data/lib/marc/xmlwriter.rb CHANGED Viewed

@@ -1,156 +1,167 @@
-require 'rexml/document'
-require 'rexml/text'
-require 'rexml/formatters/default'
+require "rexml/document"
+require "rexml/text"
+require "rexml/formatters/default"
 module MARC
   # A class for writing MARC records as MARCXML.
   # BIG CAVEAT! XMLWriter will *not* convert your MARC8 to UTF8
   # bug the authors to do this if you need it
   class XMLWriter
     # The constructor must be passed a file path
     # or an object that responds to a write message.
     # The second argument is a hash of options; currently
     # only one option is supported, stylesheet.
-    #
+    #
     # writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
     # writer.write record
-    def initialize(file, opts={})
+    #
+    COLLECTION_TAG = %(<collection xmlns='#{MARC_NS}'
+      xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
+      xsi:schemaLocation="#{MARC_NS} #{MARC_XSD}">).freeze
+    def initialize(file, opts = {}, &blk)
       @writer = REXML::Formatters::Default.new
-      if file.class == String
-        @fh = File.new(file,"w")
-      elsif file.respond_to?('write')
+      if file.instance_of?(String)
+        @fh = File.new(file, "w")
+      elsif file.respond_to?(:write)
         @fh = file
       else
         raise ArgumentError, "must pass in file name or handle"
       end
+      @stylesheet = opts[:stylesheet]
       @fh.write("<?xml version='1.0'?>\n")
-      if opts[:stylesheet]
-        @fh.write(
-          %Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>\n})
-      end
-      @fh.write("<collection xmlns='" + MARC_NS + "' " +
-        "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
-        "xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
+      @fh.write(stylesheet_tag)
+      @fh.write(COLLECTION_TAG)
       @fh.write("\n")
+      if block_given?
+        blk.call(self)
+        self.close
+      end
+    end
+    def stylesheet_tag
+      if @stylesheet
+        %(<?xml-stylesheet type="text/xsl" href="#{@stylesheet}"?>\n)
+      else
+        ""
+      end
     end
     # write a record to the file or handle
     def write(record)
       @writer.write(MARC::XMLWriter.encode(record), @fh)
       @fh.write("\n")
     end
     # close underlying filehandle
     def close
       @fh.write("</collection>")
       @fh.close
     end
+    def self.fix_leader(leader)
+      fixed_leader = leader.gsub(/[^\w|^\s]/, "Z")
+      # The leader must have at least 24 characters
+      fixed_leader = fixed_leader.ljust(24) if fixed_leader.length < 24
+      # MARCXML is particular about last four characters; ILSes aren't
+      if fixed_leader[20..23] != "4500"
+        fixed_leader[20..23] = "4500"
+      end
+      # MARCXML doesn't like a space here so we need a filler character: Z
+      if fixed_leader[6..6] == " "
+        fixed_leader[6..6] = "Z"
+      end
+      fixed_leader
+    end
     # a static method that accepts a MARC::Record object
     # and returns a REXML::Element for the XML serialization.
+    def self.encode(record, opts = {})
+      single_char = Regexp.new('[\da-z ]{1}')
+      subfield_char = Regexp.new('[\dA-Za-z!"#$%&\'()*+,-./:;<=>?{}_^`~\[\]\\\]{1}')
+      control_field_tag = Regexp.new("00[1-9A-Za-z]{1}")
-    def self.encode(record, opts={})
-      singleChar = Regexp.new('[\da-z ]{1}')
-      subfieldChar = Regexp.new('[\dA-Za-z!"#$%&\'()*+,-./:;<=>?{}_^`~\[\]\\\]{1}')
-      ctrlFieldTag = Regexp.new('00[1-9A-Za-z]{1}')
       # Right now, this writer handles input from the strict and
       # lenient MARC readers. Because it can get 'loose' MARC in, it
       # attempts to do some cleanup on data values that are not valid
       # MARCXML.
       # TODO? Perhaps the 'loose MARC' checks should be split out
       # into a tolerant MARCXMLWriter allowing the main one to skip
       # this extra work.
       # TODO: At the very least there should be some logging
       # to record our attempts to account for less than perfect MARC.
-      e = REXML::Element.new('record')
+      e = REXML::Element.new("record")
       e.add_namespace(MARC_NS) if opts[:include_namespace]
-      # MARCXML only allows alphanumerics or spaces in the leader
-      record.leader.gsub!(/[^\w|^\s]/, 'Z')
-      # MARCXML is particular about last four characters; ILSes aren't
-      if (record.leader[20..23] != "4500")
-        record.leader[20..23] = "4500"
-      end
+      leader_element = REXML::Element.new("leader")
+      leader_element.add_text(fix_leader(record.leader))
+      e.add_element(leader_element)
-      # MARCXML doesn't like a space here so we need a filler character: Z
-      if (record.leader[6..6] == " ")
-        record.leader[6..6] = "Z"
-      end
-      leader = REXML::Element.new("leader")
-      leader.add_text(record.leader)
-      e.add_element(leader)
       record.each do |field|
-        if field.class == MARC::DataField
+        if field.instance_of?(MARC::DataField)
           datafield_elem = REXML::Element.new("datafield")
+          ind1 = field.indicator1
           # If marc is leniently parsed, we may have some dirty data; using
           # the 'z' ind1 value should help us locate these later to fix
-          if field.indicator1.nil? || (field.indicator1.match(singleChar) == nil)
-            field.indicator1 = 'z'
-          end
+          ind1 = "z" if ind1.nil? || !ind1.match?(single_char)
+          ind2 = field.indicator2
           # If marc is leniently parsed, we may have some dirty data; using
           # the 'z' ind2 value should help us locate these later to fix
-          if field.indicator2.nil? || (field.indicator2.match(singleChar) == nil)
-            field.indicator2 = 'z'
-          end
+          ind2 = "z" if field.indicator2.nil? || !ind2.match?(single_char)
           datafield_elem.add_attributes({
-            "tag"=>field.tag,
-            "ind1"=>field.indicator1,
-            "ind2"=>field.indicator2
+            "tag" => field.tag,
+            "ind1" => ind1,
+            "ind2" => ind2
           })
-          for subfield in field.subfields
+          field.subfields.each do |subfield|
             subfield_element = REXML::Element.new("subfield")
+            code = subfield.code
             # If marc is leniently parsed, we may have some dirty data; using
             # the blank subfield code should help us locate these later to fix
-            if (subfield.code.match(subfieldChar) == nil)
-              subfield.code = ' '
-            end
-            subfield_element.add_attribute("code", subfield.code)
+            code = " " if subfield.code.match(subfield_char).nil?
+            subfield_element.add_attribute("code", code)
             text = subfield.value
             subfield_element.add_text(text)
             datafield_elem.add_element(subfield_element)
           end
           e.add_element datafield_elem
-        elsif field.class == MARC::ControlField
+        elsif field.instance_of?(MARC::ControlField)
           control_element = REXML::Element.new("controlfield")
+          tag = field.tag
           # We need a marker for invalid tag values (we use 00z)
-          unless field.tag.match(ctrlFieldTag) or MARC::ControlField.control_tag?(ctrlFieldTag)
-            field.tag = "00z"
-          end
-          control_element.add_attribute("tag", field.tag)
+          tag = "00z" unless tag.match(control_field_tag) || MARC::ControlField.control_tag?(tag)
+          control_element.add_attribute("tag", tag)
           text = field.value
           control_element.add_text(text)
           e.add_element(control_element)
         end
       end
       # return xml
-      return e
+      e
     end
   end
 end

data/lib/marc.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-#marc is a ruby library for reading and writing MAchine Readable Cataloging
-#(MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
+# marc is a ruby library for reading and writing MAchine Readable Cataloging
+# (MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
 #
-#USAGE
+# USAGE
 #
 #    require 'marc'
 #
@@ -11,7 +11,7 @@
 #      puts record['245']['a']
 #    end
 #
-#    # creating a record
+#    # creating a record
 #    record = MARC::Record.new()
 #    record.add_field(MARC::DataField.new('100', '0',  ' ', ['a', 'John Doe']))
 #
@@ -30,17 +30,19 @@
 #    record = MARC::Record.new()
 #    record.add_field(MARC::ControlField.new('FMT', 'Book')) # doesn't raise an error
-require File.dirname(__FILE__) + '/marc/version'
-require File.dirname(__FILE__) + '/marc/constants'
-require File.dirname(__FILE__) + '/marc/record'
-require File.dirname(__FILE__) + '/marc/datafield'
-require File.dirname(__FILE__) + '/marc/controlfield'
-require File.dirname(__FILE__) + '/marc/subfield'
-require File.dirname(__FILE__) + '/marc/reader'
-require File.dirname(__FILE__) + '/marc/writer'
-require File.dirname(__FILE__) + '/marc/exception'
-require File.dirname(__FILE__) + '/marc/xmlwriter'
-require File.dirname(__FILE__) + '/marc/xmlreader'
-require File.dirname(__FILE__) + '/marc/dublincore'
-require File.dirname(__FILE__) + '/marc/xml_parsers'
+require_relative "marc/version"
+require_relative "marc/constants"
+require_relative "marc/record"
+require_relative "marc/datafield"
+require_relative "marc/controlfield"
+require_relative "marc/subfield"
+require_relative "marc/reader"
+require_relative "marc/writer"
+require_relative "marc/exception"
+require_relative "marc/xmlwriter"
+require_relative "marc/unsafe_xmlwriter"
+require_relative "marc/xmlreader"
+require_relative "marc/dublincore"
+require_relative "marc/xml_parsers"
+require_relative "marc/jsonl_reader"
+require_relative "marc/jsonl_writer"

data/marc.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+require File.join(File.dirname(__FILE__), "lib/marc/version")
+Gem::Specification.new do |s|
+  s.name = "marc"
+  s.version = MARC::VERSION
+  s.author = "Ed Summers"
+  s.email = "ehs@pobox.com"
+  s.homepage = "https://github.com/ruby-marc/ruby-marc/"
+  s.summary = "A ruby library for working with Machine Readable Cataloging"
+  s.license = "MIT"
+  s.required_ruby_version = ">= 1.8.6"
+  s.authors = ["Kevin Clarke", "Bill Dueber", "William Groppe", "Jonathan Rochkind", "Ross Singer", "Ed Summers", "Chris Beer"]
+  s.files = `git ls-files -z`.split("\x0")
+  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  s.test_files = s.files.grep(%r{^(test|spec|features)/})
+  s.require_paths = ["lib"]
+  s.add_development_dependency "standard", "~>1.0"
+  s.add_dependency "scrub_rb", ">= 1.0.1", "< 2" # backport for ruby 2.1 String#scrub
+  s.add_dependency "unf" # unicode normalization
+  s.add_dependency "rexml" # rexml was unbundled from the stdlib in ruby 3
+end

data/test/marc8/tc_marc8_mapping.rb CHANGED Viewed

@@ -1,6 +1,6 @@
-require 'test/unit'
-require 'marc'
-require 'marc/marc8/map_to_unicode'
+require "test/unit"
+require "marc"
+require "marc/marc8/map_to_unicode"
 class TestMarc8Mapping < Test::Unit::TestCase
   def test_codesets_just_exist