marc 1.1.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +116 -30
  7. data/Gemfile +5 -0
  8. data/README.md +239 -46
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -87
  22. data/lib/marc/reader.rb +116 -124
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -82
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +28 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -34
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +101 -94
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +129 -22
@@ -1,8 +1,5 @@
1
- # encoding: UTF-8
2
-
3
- require 'marc'
4
- require 'marc/marc8/map_to_unicode'
5
- require 'unf/normalizer'
1
+ require "marc"
2
+ require "marc/marc8/map_to_unicode"
6
3
 
7
4
  module MARC
8
5
  module Marc8
@@ -24,8 +21,8 @@ module MARC
24
21
  BASIC_LATIN = 0x42
25
22
  ANSEL = 0x45
26
23
 
27
- G0_SET = ['(', ',', '$']
28
- G1_SET = [')', '-', '$']
24
+ G0_SET = ["(", ",", "$"]
25
+ G1_SET = [")", "-", "$"]
29
26
 
30
27
  CODESETS = MARC::Marc8::MapToUnicode::CODESETS
31
28
 
@@ -63,10 +60,9 @@ module MARC
63
60
  # it's not already, if it's Marc8 there's no good reason for it not to
64
61
  # be already.
65
62
  def transcode(marc8_string, options = {})
66
- invalid_replacement = options.fetch(:replace, "\uFFFD")
67
- expand_ncr = options.fetch(:expand_ncr, true)
68
- normalization = options.fetch(:normalization, :nfc)
69
-
63
+ invalid_replacement = options.fetch(:replace, "\uFFFD")
64
+ expand_ncr = options.fetch(:expand_ncr, true)
65
+ normalization = options.fetch(:normalization, :nfc)
70
66
 
71
67
  # don't choke on empty marc8_string
72
68
  return "" if marc8_string.nil? || marc8_string.empty?
@@ -82,91 +78,89 @@ module MARC
82
78
  combinings = []
83
79
  pos = 0
84
80
  while pos < marc8_string.length
85
- if marc8_string[pos] == "\x1b"
86
- next_byte = marc8_string[pos+1]
87
- if G0_SET.include? next_byte
88
- if marc8_string.length >= pos + 3
89
- if marc8_string[pos+2] == ',' and next_byte == '$'
90
- pos += 1
91
- end
92
- self.g0 = marc8_string[pos+2].ord
93
- pos = pos + 3
94
- next
95
- else
96
- # if there aren't enough remaining characters, readd
97
- # the escape character so it doesn't get lost; may
98
- # help users diagnose problem records
99
- uni_list.push marc8_string[pos]
100
- pos += 1
101
- next
102
- end
103
-
104
- elsif G1_SET.include? next_byte
105
- if marc8_string[pos+2] == '-' and next_byte == '$'
106
- pos += 1
107
- end
108
- self.g1 = marc8_string[pos+2].ord
109
- pos = pos + 3
110
- next
111
- else
112
- charset = next_byte.ord
113
- if CODESETS.has_key? charset
114
- self.g0 = charset
115
- pos += 2
116
- elsif charset == 0x73
117
- self.g0 = BASIC_LATIN
118
- pos += 2
119
- if pos == marc8_string.length
120
- break
121
- end
122
- end
81
+ if marc8_string[pos] == "\x1b"
82
+ next_byte = marc8_string[pos + 1]
83
+ if G0_SET.include? next_byte
84
+ if marc8_string.length >= pos + 3
85
+ if (marc8_string[pos + 2] == ",") && (next_byte == "$")
86
+ pos += 1
123
87
  end
124
- end
125
-
126
- mb_flag = is_multibyte(self.g0)
127
-
128
- if mb_flag
129
- code_point = (marc8_string[pos].ord * 65536 +
130
- marc8_string[pos+1].ord * 256 +
131
- marc8_string[pos+2].ord)
88
+ self.g0 = marc8_string[pos + 2].ord
132
89
  pos += 3
133
- else
134
- code_point = marc8_string[pos].ord
90
+ else
91
+ # if there aren't enough remaining characters, readd
92
+ # the escape character so it doesn't get lost; may
93
+ # help users diagnose problem records
94
+ uni_list.push marc8_string[pos]
135
95
  pos += 1
96
+ end
97
+ next
98
+ elsif G1_SET.include? next_byte
99
+ if (marc8_string[pos + 2] == "-") && (next_byte == "$")
100
+ pos += 1
101
+ end
102
+ self.g1 = marc8_string[pos + 2].ord
103
+ pos += 3
104
+ next
105
+ else
106
+ charset = next_byte.ord
107
+ if CODESETS.has_key? charset
108
+ self.g0 = charset
109
+ pos += 2
110
+ elsif charset == 0x73
111
+ self.g0 = BASIC_LATIN
112
+ pos += 2
113
+ if pos == marc8_string.length
114
+ break
115
+ end
116
+ end
136
117
  end
118
+ end
137
119
 
138
- if (code_point < 0x20 or
139
- (code_point > 0x80 and code_point < 0xa0))
140
- uni = unichr(code_point)
141
- next
142
- end
120
+ mb_flag = is_multibyte(g0)
143
121
 
144
- begin
145
- code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
146
- (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
122
+ if mb_flag
123
+ code_point = (marc8_string[pos].ord * 65536 +
124
+ marc8_string[pos + 1].ord * 256 +
125
+ marc8_string[pos + 2].ord)
126
+ pos += 3
127
+ else
128
+ code_point = marc8_string[pos].ord
129
+ pos += 1
130
+ end
147
131
 
148
- if cflag
149
- combinings.push unichr(uni)
150
- else
151
- uni_list.push unichr(uni)
152
- if combinings.length > 0
153
- uni_list.concat combinings
154
- combinings = []
155
- end
156
- end
157
- rescue KeyError
158
- if options[:invalid] == :replace
159
- # Let's coallesece multiple replacements
160
- uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
161
- pos += 1
162
- else
163
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
132
+ if (code_point < 0x20) ||
133
+ ((code_point > 0x80) && (code_point < 0xa0))
134
+ uni = unichr(code_point)
135
+ next
136
+ end
137
+
138
+ begin
139
+ code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
140
+ (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
141
+
142
+ if cflag
143
+ combinings.push unichr(uni)
144
+ else
145
+ uni_list.push unichr(uni)
146
+ if combinings.length > 0
147
+ uni_list.concat combinings
148
+ combinings = []
164
149
  end
165
150
  end
151
+ rescue KeyError
152
+ if options[:invalid] == :replace
153
+ # Let's coallesece multiple replacements
154
+ uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
155
+ pos += 1
156
+ else
157
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
158
+ end
159
+ end
166
160
  end
167
161
 
168
162
  # what to do if combining chars left over?
169
- uni_str = uni_list.join('')
163
+ uni_str = uni_list.join("")
170
164
 
171
165
  if expand_ncr
172
166
  uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
@@ -175,10 +169,10 @@ module MARC
175
169
  end
176
170
 
177
171
  if normalization
178
- uni_str = UNF::Normalizer.normalize(uni_str, normalization)
172
+ uni_str = uni_str.unicode_normalize(normalization)
179
173
  end
180
174
 
181
- return uni_str
175
+ uni_str
182
176
  end
183
177
 
184
178
  # from the original python, yeah, apparently
@@ -192,7 +186,6 @@ module MARC
192
186
  def unichr(code_point)
193
187
  [code_point].pack("U")
194
188
  end
195
-
196
189
  end
197
190
  end
198
191
  end