marc 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +116 -30
- data/Gemfile +5 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -87
- data/lib/marc/reader.rb +116 -124
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +28 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -34
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +129 -22
@@ -1,8 +1,5 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'marc'
|
4
|
-
require 'marc/marc8/map_to_unicode'
|
5
|
-
require 'unf/normalizer'
|
1
|
+
require "marc"
|
2
|
+
require "marc/marc8/map_to_unicode"
|
6
3
|
|
7
4
|
module MARC
|
8
5
|
module Marc8
|
@@ -24,8 +21,8 @@ module MARC
|
|
24
21
|
BASIC_LATIN = 0x42
|
25
22
|
ANSEL = 0x45
|
26
23
|
|
27
|
-
G0_SET = [
|
28
|
-
G1_SET = [
|
24
|
+
G0_SET = ["(", ",", "$"]
|
25
|
+
G1_SET = [")", "-", "$"]
|
29
26
|
|
30
27
|
CODESETS = MARC::Marc8::MapToUnicode::CODESETS
|
31
28
|
|
@@ -63,10 +60,9 @@ module MARC
|
|
63
60
|
# it's not already, if it's Marc8 there's no good reason for it not to
|
64
61
|
# be already.
|
65
62
|
def transcode(marc8_string, options = {})
|
66
|
-
invalid_replacement
|
67
|
-
expand_ncr
|
68
|
-
normalization
|
69
|
-
|
63
|
+
invalid_replacement = options.fetch(:replace, "\uFFFD")
|
64
|
+
expand_ncr = options.fetch(:expand_ncr, true)
|
65
|
+
normalization = options.fetch(:normalization, :nfc)
|
70
66
|
|
71
67
|
# don't choke on empty marc8_string
|
72
68
|
return "" if marc8_string.nil? || marc8_string.empty?
|
@@ -82,91 +78,89 @@ module MARC
|
|
82
78
|
combinings = []
|
83
79
|
pos = 0
|
84
80
|
while pos < marc8_string.length
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
-
self.g0 = marc8_string[pos+2].ord
|
93
|
-
pos = pos + 3
|
94
|
-
next
|
95
|
-
else
|
96
|
-
# if there aren't enough remaining characters, readd
|
97
|
-
# the escape character so it doesn't get lost; may
|
98
|
-
# help users diagnose problem records
|
99
|
-
uni_list.push marc8_string[pos]
|
100
|
-
pos += 1
|
101
|
-
next
|
102
|
-
end
|
103
|
-
|
104
|
-
elsif G1_SET.include? next_byte
|
105
|
-
if marc8_string[pos+2] == '-' and next_byte == '$'
|
106
|
-
pos += 1
|
107
|
-
end
|
108
|
-
self.g1 = marc8_string[pos+2].ord
|
109
|
-
pos = pos + 3
|
110
|
-
next
|
111
|
-
else
|
112
|
-
charset = next_byte.ord
|
113
|
-
if CODESETS.has_key? charset
|
114
|
-
self.g0 = charset
|
115
|
-
pos += 2
|
116
|
-
elsif charset == 0x73
|
117
|
-
self.g0 = BASIC_LATIN
|
118
|
-
pos += 2
|
119
|
-
if pos == marc8_string.length
|
120
|
-
break
|
121
|
-
end
|
122
|
-
end
|
81
|
+
if marc8_string[pos] == "\x1b"
|
82
|
+
next_byte = marc8_string[pos + 1]
|
83
|
+
if G0_SET.include? next_byte
|
84
|
+
if marc8_string.length >= pos + 3
|
85
|
+
if (marc8_string[pos + 2] == ",") && (next_byte == "$")
|
86
|
+
pos += 1
|
123
87
|
end
|
124
|
-
|
125
|
-
|
126
|
-
mb_flag = is_multibyte(self.g0)
|
127
|
-
|
128
|
-
if mb_flag
|
129
|
-
code_point = (marc8_string[pos].ord * 65536 +
|
130
|
-
marc8_string[pos+1].ord * 256 +
|
131
|
-
marc8_string[pos+2].ord)
|
88
|
+
self.g0 = marc8_string[pos + 2].ord
|
132
89
|
pos += 3
|
133
|
-
|
134
|
-
|
90
|
+
else
|
91
|
+
# if there aren't enough remaining characters, re-add
|
92
|
+
# the escape character so it doesn't get lost; may
|
93
|
+
# help users diagnose problem records
|
94
|
+
uni_list.push marc8_string[pos]
|
135
95
|
pos += 1
|
96
|
+
end
|
97
|
+
next
|
98
|
+
elsif G1_SET.include? next_byte
|
99
|
+
if (marc8_string[pos + 2] == "-") && (next_byte == "$")
|
100
|
+
pos += 1
|
101
|
+
end
|
102
|
+
self.g1 = marc8_string[pos + 2].ord
|
103
|
+
pos += 3
|
104
|
+
next
|
105
|
+
else
|
106
|
+
charset = next_byte.ord
|
107
|
+
if CODESETS.has_key? charset
|
108
|
+
self.g0 = charset
|
109
|
+
pos += 2
|
110
|
+
elsif charset == 0x73
|
111
|
+
self.g0 = BASIC_LATIN
|
112
|
+
pos += 2
|
113
|
+
if pos == marc8_string.length
|
114
|
+
break
|
115
|
+
end
|
116
|
+
end
|
136
117
|
end
|
118
|
+
end
|
137
119
|
|
138
|
-
|
139
|
-
(code_point > 0x80 and code_point < 0xa0))
|
140
|
-
uni = unichr(code_point)
|
141
|
-
next
|
142
|
-
end
|
120
|
+
mb_flag = is_multibyte(g0)
|
143
121
|
|
144
|
-
|
145
|
-
|
146
|
-
|
122
|
+
if mb_flag
|
123
|
+
code_point = (marc8_string[pos].ord * 65536 +
|
124
|
+
marc8_string[pos + 1].ord * 256 +
|
125
|
+
marc8_string[pos + 2].ord)
|
126
|
+
pos += 3
|
127
|
+
else
|
128
|
+
code_point = marc8_string[pos].ord
|
129
|
+
pos += 1
|
130
|
+
end
|
147
131
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
132
|
+
if (code_point < 0x20) ||
|
133
|
+
((code_point > 0x80) && (code_point < 0xa0))
|
134
|
+
uni = unichr(code_point)
|
135
|
+
next
|
136
|
+
end
|
137
|
+
|
138
|
+
begin
|
139
|
+
code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
|
140
|
+
(uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
|
141
|
+
|
142
|
+
if cflag
|
143
|
+
combinings.push unichr(uni)
|
144
|
+
else
|
145
|
+
uni_list.push unichr(uni)
|
146
|
+
if combinings.length > 0
|
147
|
+
uni_list.concat combinings
|
148
|
+
combinings = []
|
164
149
|
end
|
165
150
|
end
|
151
|
+
rescue KeyError
|
152
|
+
if options[:invalid] == :replace
|
153
|
+
# Let's coalesce multiple replacements
|
154
|
+
uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
|
155
|
+
pos += 1
|
156
|
+
else
|
157
|
+
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
|
158
|
+
end
|
159
|
+
end
|
166
160
|
end
|
167
161
|
|
168
162
|
# what to do if combining chars left over?
|
169
|
-
uni_str = uni_list.join(
|
163
|
+
uni_str = uni_list.join("")
|
170
164
|
|
171
165
|
if expand_ncr
|
172
166
|
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
@@ -175,10 +169,10 @@ module MARC
|
|
175
169
|
end
|
176
170
|
|
177
171
|
if normalization
|
178
|
-
uni_str =
|
172
|
+
uni_str = uni_str.unicode_normalize(normalization)
|
179
173
|
end
|
180
174
|
|
181
|
-
|
175
|
+
uni_str
|
182
176
|
end
|
183
177
|
|
184
178
|
# from the original python, yeah, apparently
|
@@ -192,7 +186,6 @@ module MARC
|
|
192
186
|
def unichr(code_point)
|
193
187
|
[code_point].pack("U")
|
194
188
|
end
|
195
|
-
|
196
189
|
end
|
197
190
|
end
|
198
191
|
end
|