marc 1.1.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +116 -30
  7. data/Gemfile +5 -0
  8. data/README.md +239 -46
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -87
  22. data/lib/marc/reader.rb +116 -124
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -82
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +28 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -34
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +101 -94
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +129 -22
@@ -1,8 +1,5 @@
1
- # encoding: UTF-8
2
-
3
- require 'marc'
4
- require 'marc/marc8/map_to_unicode'
5
- require 'unf/normalizer'
1
+ require "marc"
2
+ require "marc/marc8/map_to_unicode"
6
3
 
7
4
  module MARC
8
5
  module Marc8
@@ -24,8 +21,8 @@ module MARC
24
21
  BASIC_LATIN = 0x42
25
22
  ANSEL = 0x45
26
23
 
27
- G0_SET = ['(', ',', '$']
28
- G1_SET = [')', '-', '$']
24
+ G0_SET = ["(", ",", "$"]
25
+ G1_SET = [")", "-", "$"]
29
26
 
30
27
  CODESETS = MARC::Marc8::MapToUnicode::CODESETS
31
28
 
@@ -63,10 +60,9 @@ module MARC
63
60
  # it's not already, if it's Marc8 there's no good reason for it not to
64
61
  # be already.
65
62
  def transcode(marc8_string, options = {})
66
- invalid_replacement = options.fetch(:replace, "\uFFFD")
67
- expand_ncr = options.fetch(:expand_ncr, true)
68
- normalization = options.fetch(:normalization, :nfc)
69
-
63
+ invalid_replacement = options.fetch(:replace, "\uFFFD")
64
+ expand_ncr = options.fetch(:expand_ncr, true)
65
+ normalization = options.fetch(:normalization, :nfc)
70
66
 
71
67
  # don't choke on empty marc8_string
72
68
  return "" if marc8_string.nil? || marc8_string.empty?
@@ -82,91 +78,89 @@ module MARC
82
78
  combinings = []
83
79
  pos = 0
84
80
  while pos < marc8_string.length
85
- if marc8_string[pos] == "\x1b"
86
- next_byte = marc8_string[pos+1]
87
- if G0_SET.include? next_byte
88
- if marc8_string.length >= pos + 3
89
- if marc8_string[pos+2] == ',' and next_byte == '$'
90
- pos += 1
91
- end
92
- self.g0 = marc8_string[pos+2].ord
93
- pos = pos + 3
94
- next
95
- else
96
- # if there aren't enough remaining characters, readd
97
- # the escape character so it doesn't get lost; may
98
- # help users diagnose problem records
99
- uni_list.push marc8_string[pos]
100
- pos += 1
101
- next
102
- end
103
-
104
- elsif G1_SET.include? next_byte
105
- if marc8_string[pos+2] == '-' and next_byte == '$'
106
- pos += 1
107
- end
108
- self.g1 = marc8_string[pos+2].ord
109
- pos = pos + 3
110
- next
111
- else
112
- charset = next_byte.ord
113
- if CODESETS.has_key? charset
114
- self.g0 = charset
115
- pos += 2
116
- elsif charset == 0x73
117
- self.g0 = BASIC_LATIN
118
- pos += 2
119
- if pos == marc8_string.length
120
- break
121
- end
122
- end
81
+ if marc8_string[pos] == "\x1b"
82
+ next_byte = marc8_string[pos + 1]
83
+ if G0_SET.include? next_byte
84
+ if marc8_string.length >= pos + 3
85
+ if (marc8_string[pos + 2] == ",") && (next_byte == "$")
86
+ pos += 1
123
87
  end
124
- end
125
-
126
- mb_flag = is_multibyte(self.g0)
127
-
128
- if mb_flag
129
- code_point = (marc8_string[pos].ord * 65536 +
130
- marc8_string[pos+1].ord * 256 +
131
- marc8_string[pos+2].ord)
88
+ self.g0 = marc8_string[pos + 2].ord
132
89
  pos += 3
133
- else
134
- code_point = marc8_string[pos].ord
90
+ else
91
+ # if there aren't enough remaining characters, readd
92
+ # the escape character so it doesn't get lost; may
93
+ # help users diagnose problem records
94
+ uni_list.push marc8_string[pos]
135
95
  pos += 1
96
+ end
97
+ next
98
+ elsif G1_SET.include? next_byte
99
+ if (marc8_string[pos + 2] == "-") && (next_byte == "$")
100
+ pos += 1
101
+ end
102
+ self.g1 = marc8_string[pos + 2].ord
103
+ pos += 3
104
+ next
105
+ else
106
+ charset = next_byte.ord
107
+ if CODESETS.has_key? charset
108
+ self.g0 = charset
109
+ pos += 2
110
+ elsif charset == 0x73
111
+ self.g0 = BASIC_LATIN
112
+ pos += 2
113
+ if pos == marc8_string.length
114
+ break
115
+ end
116
+ end
136
117
  end
118
+ end
137
119
 
138
- if (code_point < 0x20 or
139
- (code_point > 0x80 and code_point < 0xa0))
140
- uni = unichr(code_point)
141
- next
142
- end
120
+ mb_flag = is_multibyte(g0)
143
121
 
144
- begin
145
- code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
146
- (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
122
+ if mb_flag
123
+ code_point = (marc8_string[pos].ord * 65536 +
124
+ marc8_string[pos + 1].ord * 256 +
125
+ marc8_string[pos + 2].ord)
126
+ pos += 3
127
+ else
128
+ code_point = marc8_string[pos].ord
129
+ pos += 1
130
+ end
147
131
 
148
- if cflag
149
- combinings.push unichr(uni)
150
- else
151
- uni_list.push unichr(uni)
152
- if combinings.length > 0
153
- uni_list.concat combinings
154
- combinings = []
155
- end
156
- end
157
- rescue KeyError
158
- if options[:invalid] == :replace
159
- # Let's coallesece multiple replacements
160
- uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
161
- pos += 1
162
- else
163
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
132
+ if (code_point < 0x20) ||
133
+ ((code_point > 0x80) && (code_point < 0xa0))
134
+ uni = unichr(code_point)
135
+ next
136
+ end
137
+
138
+ begin
139
+ code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
140
+ (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
141
+
142
+ if cflag
143
+ combinings.push unichr(uni)
144
+ else
145
+ uni_list.push unichr(uni)
146
+ if combinings.length > 0
147
+ uni_list.concat combinings
148
+ combinings = []
164
149
  end
165
150
  end
151
+ rescue KeyError
152
+ if options[:invalid] == :replace
153
+ # Let's coallesece multiple replacements
154
+ uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
155
+ pos += 1
156
+ else
157
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
158
+ end
159
+ end
166
160
  end
167
161
 
168
162
  # what to do if combining chars left over?
169
- uni_str = uni_list.join('')
163
+ uni_str = uni_list.join("")
170
164
 
171
165
  if expand_ncr
172
166
  uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
@@ -175,10 +169,10 @@ module MARC
175
169
  end
176
170
 
177
171
  if normalization
178
- uni_str = UNF::Normalizer.normalize(uni_str, normalization)
172
+ uni_str = uni_str.unicode_normalize(normalization)
179
173
  end
180
174
 
181
- return uni_str
175
+ uni_str
182
176
  end
183
177
 
184
178
  # from the original python, yeah, apparently
@@ -192,7 +186,6 @@ module MARC
192
186
  def unichr(code_point)
193
187
  [code_point].pack("U")
194
188
  end
195
-
196
189
  end
197
190
  end
198
191
  end