marc 1.0.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +106 -29
  7. data/Gemfile +15 -0
  8. data/README.md +240 -47
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -86
  22. data/lib/marc/reader.rb +119 -121
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -81
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +23 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -32
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +99 -87
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +94 -9
@@ -1,8 +1,6 @@
1
- # encoding: UTF-8
2
-
3
- require 'marc'
4
- require 'marc/marc8/map_to_unicode'
5
- require 'unf/normalizer'
1
+ require "marc"
2
+ require "marc/marc8/map_to_unicode"
3
+ require "unf/normalizer"
6
4
 
7
5
  module MARC
8
6
  module Marc8
@@ -24,8 +22,8 @@ module MARC
24
22
  BASIC_LATIN = 0x42
25
23
  ANSEL = 0x45
26
24
 
27
- G0_SET = ['(', ',', '$']
28
- G1_SET = [')', '-', '$']
25
+ G0_SET = ["(", ",", "$"]
26
+ G1_SET = [")", "-", "$"]
29
27
 
30
28
  CODESETS = MARC::Marc8::MapToUnicode::CODESETS
31
29
 
@@ -63,10 +61,9 @@ module MARC
63
61
  # it's not already, if it's Marc8 there's no good reason for it not to
64
62
  # be already.
65
63
  def transcode(marc8_string, options = {})
66
- invalid_replacement = options.fetch(:replace, "\uFFFD")
67
- expand_ncr = options.fetch(:expand_ncr, true)
68
- normalization = options.fetch(:normalization, :nfc)
69
-
64
+ invalid_replacement = options.fetch(:replace, "\uFFFD")
65
+ expand_ncr = options.fetch(:expand_ncr, true)
66
+ normalization = options.fetch(:normalization, :nfc)
70
67
 
71
68
  # don't choke on empty marc8_string
72
69
  return "" if marc8_string.nil? || marc8_string.empty?
@@ -82,91 +79,89 @@ module MARC
82
79
  combinings = []
83
80
  pos = 0
84
81
  while pos < marc8_string.length
85
- if marc8_string[pos] == "\x1b"
86
- next_byte = marc8_string[pos+1]
87
- if G0_SET.include? next_byte
88
- if marc8_string.length >= pos + 3
89
- if marc8_string[pos+2] == ',' and next_byte == '$'
90
- pos += 1
91
- end
92
- self.g0 = marc8_string[pos+2].ord
93
- pos = pos + 3
94
- next
95
- else
96
- # if there aren't enough remaining characters, readd
97
- # the escape character so it doesn't get lost; may
98
- # help users diagnose problem records
99
- uni_list.push marc8_string[pos]
100
- pos += 1
101
- next
102
- end
103
-
104
- elsif G1_SET.include? next_byte
105
- if marc8_string[pos+2] == '-' and next_byte == '$'
106
- pos += 1
107
- end
108
- self.g1 = marc8_string[pos+2].ord
109
- pos = pos + 3
110
- next
111
- else
112
- charset = next_byte.ord
113
- if CODESETS.has_key? charset
114
- self.g0 = charset
115
- pos += 2
116
- elsif charset == 0x73
117
- self.g0 = BASIC_LATIN
118
- pos += 2
119
- if pos == marc8_string.length
120
- break
121
- end
122
- end
82
+ if marc8_string[pos] == "\x1b"
83
+ next_byte = marc8_string[pos + 1]
84
+ if G0_SET.include? next_byte
85
+ if marc8_string.length >= pos + 3
86
+ if (marc8_string[pos + 2] == ",") && (next_byte == "$")
87
+ pos += 1
123
88
  end
124
- end
125
-
126
- mb_flag = is_multibyte(self.g0)
127
-
128
- if mb_flag
129
- code_point = (marc8_string[pos].ord * 65536 +
130
- marc8_string[pos+1].ord * 256 +
131
- marc8_string[pos+2].ord)
89
+ self.g0 = marc8_string[pos + 2].ord
132
90
  pos += 3
133
- else
134
- code_point = marc8_string[pos].ord
91
+ else
92
+ # if there aren't enough remaining characters, readd
93
+ # the escape character so it doesn't get lost; may
94
+ # help users diagnose problem records
95
+ uni_list.push marc8_string[pos]
135
96
  pos += 1
97
+ end
98
+ next
99
+ elsif G1_SET.include? next_byte
100
+ if (marc8_string[pos + 2] == "-") && (next_byte == "$")
101
+ pos += 1
102
+ end
103
+ self.g1 = marc8_string[pos + 2].ord
104
+ pos += 3
105
+ next
106
+ else
107
+ charset = next_byte.ord
108
+ if CODESETS.has_key? charset
109
+ self.g0 = charset
110
+ pos += 2
111
+ elsif charset == 0x73
112
+ self.g0 = BASIC_LATIN
113
+ pos += 2
114
+ if pos == marc8_string.length
115
+ break
116
+ end
117
+ end
136
118
  end
119
+ end
137
120
 
138
- if (code_point < 0x20 or
139
- (code_point > 0x80 and code_point < 0xa0))
140
- uni = unichr(code_point)
141
- next
142
- end
121
+ mb_flag = is_multibyte(g0)
143
122
 
144
- begin
145
- code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
146
- (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
123
+ if mb_flag
124
+ code_point = (marc8_string[pos].ord * 65536 +
125
+ marc8_string[pos + 1].ord * 256 +
126
+ marc8_string[pos + 2].ord)
127
+ pos += 3
128
+ else
129
+ code_point = marc8_string[pos].ord
130
+ pos += 1
131
+ end
147
132
 
148
- if cflag
149
- combinings.push unichr(uni)
150
- else
151
- uni_list.push unichr(uni)
152
- if combinings.length > 0
153
- uni_list.concat combinings
154
- combinings = []
155
- end
156
- end
157
- rescue KeyError
158
- if options[:invalid] == :replace
159
- # Let's coallesece multiple replacements
160
- uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
161
- pos += 1
162
- else
163
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
133
+ if (code_point < 0x20) ||
134
+ ((code_point > 0x80) && (code_point < 0xa0))
135
+ uni = unichr(code_point)
136
+ next
137
+ end
138
+
139
+ begin
140
+ code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
141
+ (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
142
+
143
+ if cflag
144
+ combinings.push unichr(uni)
145
+ else
146
+ uni_list.push unichr(uni)
147
+ if combinings.length > 0
148
+ uni_list.concat combinings
149
+ combinings = []
164
150
  end
165
151
  end
152
+ rescue KeyError
153
+ if options[:invalid] == :replace
154
+ # Let's coallesece multiple replacements
155
+ uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
156
+ pos += 1
157
+ else
158
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
159
+ end
160
+ end
166
161
  end
167
162
 
168
163
  # what to do if combining chars left over?
169
- uni_str = uni_list.join('')
164
+ uni_str = uni_list.join("")
170
165
 
171
166
  if expand_ncr
172
167
  uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
@@ -178,7 +173,7 @@ module MARC
178
173
  uni_str = UNF::Normalizer.normalize(uni_str, normalization)
179
174
  end
180
175
 
181
- return uni_str
176
+ uni_str
182
177
  end
183
178
 
184
179
  # from the original python, yeah, apparently
@@ -192,7 +187,6 @@ module MARC
192
187
  def unichr(code_point)
193
188
  [code_point].pack("U")
194
189
  end
195
-
196
190
  end
197
191
  end
198
192
  end