RubyGems - marc - Versions diffs - 1.1.1 → 1.3.0 - Mend

marc 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

checksums.yaml +4 -4
data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
data/.github/workflows/ruby.yml +24 -0
data/.gitignore +17 -0
data/.standard.yml +1 -0
data/{Changes → CHANGELOG.md} +116 -30
data/Gemfile +5 -0
data/README.md +239 -46
data/Rakefile +14 -14
data/bin/marc +14 -0
data/bin/marc2xml +17 -0
data/examples/xml2marc.rb +10 -0
data/lib/marc/constants.rb +3 -3
data/lib/marc/controlfield.rb +35 -23
data/lib/marc/datafield.rb +70 -63
data/lib/marc/dublincore.rb +59 -41
data/lib/marc/exception.rb +9 -1
data/lib/marc/jsonl_reader.rb +33 -0
data/lib/marc/jsonl_writer.rb +44 -0
data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
data/lib/marc/marc8/to_unicode.rb +80 -87
data/lib/marc/reader.rb +116 -124
data/lib/marc/record.rb +72 -62
data/lib/marc/subfield.rb +12 -10
data/lib/marc/unsafe_xmlwriter.rb +93 -0
data/lib/marc/version.rb +1 -1
data/lib/marc/writer.rb +27 -30
data/lib/marc/xml_parsers.rb +222 -197
data/lib/marc/xmlreader.rb +131 -114
data/lib/marc/xmlwriter.rb +93 -82
data/lib/marc.rb +20 -18
data/marc.gemspec +28 -0
data/test/marc8/tc_marc8_mapping.rb +3 -3
data/test/marc8/tc_to_unicode.rb +28 -34
data/test/messed_up_leader.xml +9 -0
data/test/tc_controlfield.rb +37 -34
data/test/tc_datafield.rb +65 -60
data/test/tc_dublincore.rb +9 -11
data/test/tc_hash.rb +10 -13
data/test/tc_jsonl.rb +19 -0
data/test/tc_marchash.rb +17 -21
data/test/tc_parsers.rb +108 -144
data/test/tc_reader.rb +35 -36
data/test/tc_reader_char_encodings.rb +149 -169
data/test/tc_record.rb +143 -148
data/test/tc_subfield.rb +14 -13
data/test/tc_unsafe_xml.rb +95 -0
data/test/tc_writer.rb +101 -108
data/test/tc_xml.rb +101 -94
data/test/tc_xml_error_handling.rb +7 -8
data/test/ts_marc.rb +8 -8
metadata +129 -22

data/lib/marc/marc8/to_unicode.rb CHANGED

@@ -1,8 +1,5 @@
-# encoding: UTF-8
-require 'marc'
-require 'marc/marc8/map_to_unicode'
-require 'unf/normalizer'
+require "marc"
+require "marc/marc8/map_to_unicode"
 module MARC
   module Marc8
@@ -24,8 +21,8 @@ module MARC
       BASIC_LATIN = 0x42
       ANSEL = 0x45
-      G0_SET = ['(', ',', '$']
-      G1_SET = [')', '-', '$']
+      G0_SET = ["(", ",", "$"]
+      G1_SET = [")", "-", "$"]
       CODESETS = MARC::Marc8::MapToUnicode::CODESETS
@@ -63,10 +60,9 @@ module MARC
       # it's not already, if it's Marc8 there's no good reason for it not to
       # be already.
       def transcode(marc8_string, options = {})
-        invalid_replacement     = options.fetch(:replace, "\uFFFD")
-        expand_ncr              = options.fetch(:expand_ncr, true)
-        normalization           = options.fetch(:normalization, :nfc)
+        invalid_replacement = options.fetch(:replace, "\uFFFD")
+        expand_ncr = options.fetch(:expand_ncr, true)
+        normalization = options.fetch(:normalization, :nfc)
         # don't choke on empty marc8_string
         return "" if marc8_string.nil? || marc8_string.empty?
@@ -82,91 +78,89 @@ module MARC
         combinings = []
         pos = 0
         while pos < marc8_string.length
-            if marc8_string[pos] == "\x1b"
-                next_byte = marc8_string[pos+1]
-                if G0_SET.include? next_byte
-                    if marc8_string.length >= pos + 3
-                        if marc8_string[pos+2] == ',' and next_byte == '$'
-                            pos += 1
-                        end
-                        self.g0 = marc8_string[pos+2].ord
-                        pos = pos + 3
-                        next
-                    else
-                        # if there aren't enough remaining characters, readd
-                        # the escape character so it doesn't get lost; may
-                        # help users diagnose problem records
-                        uni_list.push marc8_string[pos]
-                        pos += 1
-                        next
-                    end
-                elsif G1_SET.include? next_byte
-                    if marc8_string[pos+2] == '-' and next_byte == '$'
-                        pos += 1
-                    end
-                    self.g1 = marc8_string[pos+2].ord
-                    pos = pos + 3
-                    next
-                else
-                    charset = next_byte.ord
-                    if CODESETS.has_key? charset
-                        self.g0 = charset
-                        pos += 2
-                    elsif charset == 0x73
-                        self.g0 = BASIC_LATIN
-                        pos += 2
-                        if pos == marc8_string.length
-                            break
-                        end
-                    end
+          if marc8_string[pos] == "\x1b"
+            next_byte = marc8_string[pos + 1]
+            if G0_SET.include? next_byte
+              if marc8_string.length >= pos + 3
+                if (marc8_string[pos + 2] == ",") && (next_byte == "$")
+                  pos += 1
                 end
-            end
-            mb_flag = is_multibyte(self.g0)
-            if mb_flag
-                code_point = (marc8_string[pos].ord * 65536 +
-                     marc8_string[pos+1].ord * 256 +
-                     marc8_string[pos+2].ord)
+                self.g0 = marc8_string[pos + 2].ord
                 pos += 3
-            else
-                code_point = marc8_string[pos].ord
+              else
+                # if there aren't enough remaining characters, readd
+                # the escape character so it doesn't get lost; may
+                # help users diagnose problem records
+                uni_list.push marc8_string[pos]
                 pos += 1
+              end
+              next
+            elsif G1_SET.include? next_byte
+              if (marc8_string[pos + 2] == "-") && (next_byte == "$")
+                pos += 1
+              end
+              self.g1 = marc8_string[pos + 2].ord
+              pos += 3
+              next
+            else
+              charset = next_byte.ord
+              if CODESETS.has_key? charset
+                self.g0 = charset
+                pos += 2
+              elsif charset == 0x73
+                self.g0 = BASIC_LATIN
+                pos += 2
+                if pos == marc8_string.length
+                  break
+                end
+              end
             end
+          end
-            if (code_point < 0x20 or
-                (code_point > 0x80 and code_point < 0xa0))
-                uni = unichr(code_point)
-                next
-            end
+          mb_flag = is_multibyte(g0)
-            begin
-              code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
-              (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
+          if mb_flag
+            code_point = (marc8_string[pos].ord * 65536 +
+              marc8_string[pos + 1].ord * 256 +
+              marc8_string[pos + 2].ord)
+            pos += 3
+          else
+            code_point = marc8_string[pos].ord
+            pos += 1
+          end
-              if cflag
-                  combinings.push unichr(uni)
-              else
-                  uni_list.push unichr(uni)
-                  if combinings.length > 0
-                      uni_list.concat combinings
-                      combinings = []
-                  end
-              end
-            rescue KeyError
-              if options[:invalid] == :replace
-                # Let's coallesece multiple replacements
-                uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
-                pos += 1
-              else
-                raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
+          if (code_point < 0x20) ||
+              ((code_point > 0x80) && (code_point < 0xa0))
+            uni = unichr(code_point)
+            next
+          end
+          begin
+            code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
+            (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
+            if cflag
+              combinings.push unichr(uni)
+            else
+              uni_list.push unichr(uni)
+              if combinings.length > 0
+                uni_list.concat combinings
+                combinings = []
               end
             end
+          rescue KeyError
+            if options[:invalid] == :replace
+              # Let's coallesece multiple replacements
+              uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
+              pos += 1
+            else
+              raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
+            end
+          end
         end
         # what to do if combining chars left over?
-        uni_str = uni_list.join('')
+        uni_str = uni_list.join("")
         if expand_ncr
           uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
@@ -175,10 +169,10 @@ module MARC
         end
         if normalization
-          uni_str = UNF::Normalizer.normalize(uni_str, normalization)
+          uni_str = uni_str.unicode_normalize(normalization)
         end
-        return uni_str
+        uni_str
       end
       # from the original python, yeah, apparently
@@ -192,7 +186,6 @@ module MARC
       def unichr(code_point)
         [code_point].pack("U")
       end
     end
   end
 end