marc 1.0.4 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +106 -29
  7. data/Gemfile +15 -0
  8. data/README.md +240 -47
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -86
  22. data/lib/marc/reader.rb +119 -121
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -81
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +23 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -32
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +99 -87
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +94 -9
@@ -1,8 +1,6 @@
1
- # encoding: UTF-8
2
-
3
- require 'marc'
4
- require 'marc/marc8/map_to_unicode'
5
- require 'unf/normalizer'
1
+ require "marc"
2
+ require "marc/marc8/map_to_unicode"
3
+ require "unf/normalizer"
6
4
 
7
5
  module MARC
8
6
  module Marc8
@@ -24,8 +22,8 @@ module MARC
24
22
  BASIC_LATIN = 0x42
25
23
  ANSEL = 0x45
26
24
 
27
- G0_SET = ['(', ',', '$']
28
- G1_SET = [')', '-', '$']
25
+ G0_SET = ["(", ",", "$"]
26
+ G1_SET = [")", "-", "$"]
29
27
 
30
28
  CODESETS = MARC::Marc8::MapToUnicode::CODESETS
31
29
 
@@ -63,10 +61,9 @@ module MARC
63
61
  # it's not already, if it's Marc8 there's no good reason for it not to
64
62
  # be already.
65
63
  def transcode(marc8_string, options = {})
66
- invalid_replacement = options.fetch(:replace, "\uFFFD")
67
- expand_ncr = options.fetch(:expand_ncr, true)
68
- normalization = options.fetch(:normalization, :nfc)
69
-
64
+ invalid_replacement = options.fetch(:replace, "\uFFFD")
65
+ expand_ncr = options.fetch(:expand_ncr, true)
66
+ normalization = options.fetch(:normalization, :nfc)
70
67
 
71
68
  # don't choke on empty marc8_string
72
69
  return "" if marc8_string.nil? || marc8_string.empty?
@@ -82,91 +79,89 @@ module MARC
82
79
  combinings = []
83
80
  pos = 0
84
81
  while pos < marc8_string.length
85
- if marc8_string[pos] == "\x1b"
86
- next_byte = marc8_string[pos+1]
87
- if G0_SET.include? next_byte
88
- if marc8_string.length >= pos + 3
89
- if marc8_string[pos+2] == ',' and next_byte == '$'
90
- pos += 1
91
- end
92
- self.g0 = marc8_string[pos+2].ord
93
- pos = pos + 3
94
- next
95
- else
96
- # if there aren't enough remaining characters, readd
97
- # the escape character so it doesn't get lost; may
98
- # help users diagnose problem records
99
- uni_list.push marc8_string[pos]
100
- pos += 1
101
- next
102
- end
103
-
104
- elsif G1_SET.include? next_byte
105
- if marc8_string[pos+2] == '-' and next_byte == '$'
106
- pos += 1
107
- end
108
- self.g1 = marc8_string[pos+2].ord
109
- pos = pos + 3
110
- next
111
- else
112
- charset = next_byte.ord
113
- if CODESETS.has_key? charset
114
- self.g0 = charset
115
- pos += 2
116
- elsif charset == 0x73
117
- self.g0 = BASIC_LATIN
118
- pos += 2
119
- if pos == marc8_string.length
120
- break
121
- end
122
- end
82
+ if marc8_string[pos] == "\x1b"
83
+ next_byte = marc8_string[pos + 1]
84
+ if G0_SET.include? next_byte
85
+ if marc8_string.length >= pos + 3
86
+ if (marc8_string[pos + 2] == ",") && (next_byte == "$")
87
+ pos += 1
123
88
  end
124
- end
125
-
126
- mb_flag = is_multibyte(self.g0)
127
-
128
- if mb_flag
129
- code_point = (marc8_string[pos].ord * 65536 +
130
- marc8_string[pos+1].ord * 256 +
131
- marc8_string[pos+2].ord)
89
+ self.g0 = marc8_string[pos + 2].ord
132
90
  pos += 3
133
- else
134
- code_point = marc8_string[pos].ord
91
+ else
92
+ # if there aren't enough remaining characters, readd
93
+ # the escape character so it doesn't get lost; may
94
+ # help users diagnose problem records
95
+ uni_list.push marc8_string[pos]
135
96
  pos += 1
97
+ end
98
+ next
99
+ elsif G1_SET.include? next_byte
100
+ if (marc8_string[pos + 2] == "-") && (next_byte == "$")
101
+ pos += 1
102
+ end
103
+ self.g1 = marc8_string[pos + 2].ord
104
+ pos += 3
105
+ next
106
+ else
107
+ charset = next_byte.ord
108
+ if CODESETS.has_key? charset
109
+ self.g0 = charset
110
+ pos += 2
111
+ elsif charset == 0x73
112
+ self.g0 = BASIC_LATIN
113
+ pos += 2
114
+ if pos == marc8_string.length
115
+ break
116
+ end
117
+ end
136
118
  end
119
+ end
137
120
 
138
- if (code_point < 0x20 or
139
- (code_point > 0x80 and code_point < 0xa0))
140
- uni = unichr(code_point)
141
- next
142
- end
121
+ mb_flag = is_multibyte(g0)
143
122
 
144
- begin
145
- code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
146
- (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
123
+ if mb_flag
124
+ code_point = (marc8_string[pos].ord * 65536 +
125
+ marc8_string[pos + 1].ord * 256 +
126
+ marc8_string[pos + 2].ord)
127
+ pos += 3
128
+ else
129
+ code_point = marc8_string[pos].ord
130
+ pos += 1
131
+ end
147
132
 
148
- if cflag
149
- combinings.push unichr(uni)
150
- else
151
- uni_list.push unichr(uni)
152
- if combinings.length > 0
153
- uni_list.concat combinings
154
- combinings = []
155
- end
156
- end
157
- rescue KeyError
158
- if options[:invalid] == :replace
159
- # Let's coallesece multiple replacements
160
- uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
161
- pos += 1
162
- else
163
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
133
+ if (code_point < 0x20) ||
134
+ ((code_point > 0x80) && (code_point < 0xa0))
135
+ uni = unichr(code_point)
136
+ next
137
+ end
138
+
139
+ begin
140
+ code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
141
+ (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
142
+
143
+ if cflag
144
+ combinings.push unichr(uni)
145
+ else
146
+ uni_list.push unichr(uni)
147
+ if combinings.length > 0
148
+ uni_list.concat combinings
149
+ combinings = []
164
150
  end
165
151
  end
152
+ rescue KeyError
153
+ if options[:invalid] == :replace
154
+ # Let's coallesece multiple replacements
155
+ uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
156
+ pos += 1
157
+ else
158
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
159
+ end
160
+ end
166
161
  end
167
162
 
168
163
  # what to do if combining chars left over?
169
- uni_str = uni_list.join('')
164
+ uni_str = uni_list.join("")
170
165
 
171
166
  if expand_ncr
172
167
  uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
@@ -178,7 +173,7 @@ module MARC
178
173
  uni_str = UNF::Normalizer.normalize(uni_str, normalization)
179
174
  end
180
175
 
181
- return uni_str
176
+ uni_str
182
177
  end
183
178
 
184
179
  # from the original python, yeah, apparently
@@ -192,7 +187,6 @@ module MARC
192
187
  def unichr(code_point)
193
188
  [code_point].pack("U")
194
189
  end
195
-
196
190
  end
197
191
  end
198
192
  end