marc 1.1.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +116 -30
- data/Gemfile +5 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -87
- data/lib/marc/reader.rb +116 -124
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +28 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -34
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +129 -22
@@ -1,8 +1,5 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'marc'
|
4
|
-
require 'marc/marc8/map_to_unicode'
|
5
|
-
require 'unf/normalizer'
|
1
|
+
require "marc"
|
2
|
+
require "marc/marc8/map_to_unicode"
|
6
3
|
|
7
4
|
module MARC
|
8
5
|
module Marc8
|
@@ -24,8 +21,8 @@ module MARC
|
|
24
21
|
BASIC_LATIN = 0x42
|
25
22
|
ANSEL = 0x45
|
26
23
|
|
27
|
-
G0_SET = [
|
28
|
-
G1_SET = [
|
24
|
+
G0_SET = ["(", ",", "$"]
|
25
|
+
G1_SET = [")", "-", "$"]
|
29
26
|
|
30
27
|
CODESETS = MARC::Marc8::MapToUnicode::CODESETS
|
31
28
|
|
@@ -63,10 +60,9 @@ module MARC
|
|
63
60
|
# it's not already, if it's Marc8 there's no good reason for it not to
|
64
61
|
# be already.
|
65
62
|
def transcode(marc8_string, options = {})
|
66
|
-
invalid_replacement
|
67
|
-
expand_ncr
|
68
|
-
normalization
|
69
|
-
|
63
|
+
invalid_replacement = options.fetch(:replace, "\uFFFD")
|
64
|
+
expand_ncr = options.fetch(:expand_ncr, true)
|
65
|
+
normalization = options.fetch(:normalization, :nfc)
|
70
66
|
|
71
67
|
# don't choke on empty marc8_string
|
72
68
|
return "" if marc8_string.nil? || marc8_string.empty?
|
@@ -82,91 +78,89 @@ module MARC
|
|
82
78
|
combinings = []
|
83
79
|
pos = 0
|
84
80
|
while pos < marc8_string.length
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
-
self.g0 = marc8_string[pos+2].ord
|
93
|
-
pos = pos + 3
|
94
|
-
next
|
95
|
-
else
|
96
|
-
# if there aren't enough remaining characters, readd
|
97
|
-
# the escape character so it doesn't get lost; may
|
98
|
-
# help users diagnose problem records
|
99
|
-
uni_list.push marc8_string[pos]
|
100
|
-
pos += 1
|
101
|
-
next
|
102
|
-
end
|
103
|
-
|
104
|
-
elsif G1_SET.include? next_byte
|
105
|
-
if marc8_string[pos+2] == '-' and next_byte == '$'
|
106
|
-
pos += 1
|
107
|
-
end
|
108
|
-
self.g1 = marc8_string[pos+2].ord
|
109
|
-
pos = pos + 3
|
110
|
-
next
|
111
|
-
else
|
112
|
-
charset = next_byte.ord
|
113
|
-
if CODESETS.has_key? charset
|
114
|
-
self.g0 = charset
|
115
|
-
pos += 2
|
116
|
-
elsif charset == 0x73
|
117
|
-
self.g0 = BASIC_LATIN
|
118
|
-
pos += 2
|
119
|
-
if pos == marc8_string.length
|
120
|
-
break
|
121
|
-
end
|
122
|
-
end
|
81
|
+
if marc8_string[pos] == "\x1b"
|
82
|
+
next_byte = marc8_string[pos + 1]
|
83
|
+
if G0_SET.include? next_byte
|
84
|
+
if marc8_string.length >= pos + 3
|
85
|
+
if (marc8_string[pos + 2] == ",") && (next_byte == "$")
|
86
|
+
pos += 1
|
123
87
|
end
|
124
|
-
|
125
|
-
|
126
|
-
mb_flag = is_multibyte(self.g0)
|
127
|
-
|
128
|
-
if mb_flag
|
129
|
-
code_point = (marc8_string[pos].ord * 65536 +
|
130
|
-
marc8_string[pos+1].ord * 256 +
|
131
|
-
marc8_string[pos+2].ord)
|
88
|
+
self.g0 = marc8_string[pos + 2].ord
|
132
89
|
pos += 3
|
133
|
-
|
134
|
-
|
90
|
+
else
|
91
|
+
# if there aren't enough remaining characters, readd
|
92
|
+
# the escape character so it doesn't get lost; may
|
93
|
+
# help users diagnose problem records
|
94
|
+
uni_list.push marc8_string[pos]
|
135
95
|
pos += 1
|
96
|
+
end
|
97
|
+
next
|
98
|
+
elsif G1_SET.include? next_byte
|
99
|
+
if (marc8_string[pos + 2] == "-") && (next_byte == "$")
|
100
|
+
pos += 1
|
101
|
+
end
|
102
|
+
self.g1 = marc8_string[pos + 2].ord
|
103
|
+
pos += 3
|
104
|
+
next
|
105
|
+
else
|
106
|
+
charset = next_byte.ord
|
107
|
+
if CODESETS.has_key? charset
|
108
|
+
self.g0 = charset
|
109
|
+
pos += 2
|
110
|
+
elsif charset == 0x73
|
111
|
+
self.g0 = BASIC_LATIN
|
112
|
+
pos += 2
|
113
|
+
if pos == marc8_string.length
|
114
|
+
break
|
115
|
+
end
|
116
|
+
end
|
136
117
|
end
|
118
|
+
end
|
137
119
|
|
138
|
-
|
139
|
-
(code_point > 0x80 and code_point < 0xa0))
|
140
|
-
uni = unichr(code_point)
|
141
|
-
next
|
142
|
-
end
|
120
|
+
mb_flag = is_multibyte(g0)
|
143
121
|
|
144
|
-
|
145
|
-
|
146
|
-
|
122
|
+
if mb_flag
|
123
|
+
code_point = (marc8_string[pos].ord * 65536 +
|
124
|
+
marc8_string[pos + 1].ord * 256 +
|
125
|
+
marc8_string[pos + 2].ord)
|
126
|
+
pos += 3
|
127
|
+
else
|
128
|
+
code_point = marc8_string[pos].ord
|
129
|
+
pos += 1
|
130
|
+
end
|
147
131
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
132
|
+
if (code_point < 0x20) ||
|
133
|
+
((code_point > 0x80) && (code_point < 0xa0))
|
134
|
+
uni = unichr(code_point)
|
135
|
+
next
|
136
|
+
end
|
137
|
+
|
138
|
+
begin
|
139
|
+
code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
|
140
|
+
(uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
|
141
|
+
|
142
|
+
if cflag
|
143
|
+
combinings.push unichr(uni)
|
144
|
+
else
|
145
|
+
uni_list.push unichr(uni)
|
146
|
+
if combinings.length > 0
|
147
|
+
uni_list.concat combinings
|
148
|
+
combinings = []
|
164
149
|
end
|
165
150
|
end
|
151
|
+
rescue KeyError
|
152
|
+
if options[:invalid] == :replace
|
153
|
+
# Let's coallesece multiple replacements
|
154
|
+
uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
|
155
|
+
pos += 1
|
156
|
+
else
|
157
|
+
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
|
158
|
+
end
|
159
|
+
end
|
166
160
|
end
|
167
161
|
|
168
162
|
# what to do if combining chars left over?
|
169
|
-
uni_str = uni_list.join(
|
163
|
+
uni_str = uni_list.join("")
|
170
164
|
|
171
165
|
if expand_ncr
|
172
166
|
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
@@ -175,10 +169,10 @@ module MARC
|
|
175
169
|
end
|
176
170
|
|
177
171
|
if normalization
|
178
|
-
uni_str =
|
172
|
+
uni_str = uni_str.unicode_normalize(normalization)
|
179
173
|
end
|
180
174
|
|
181
|
-
|
175
|
+
uni_str
|
182
176
|
end
|
183
177
|
|
184
178
|
# from the original python, yeah, apparently
|
@@ -192,7 +186,6 @@ module MARC
|
|
192
186
|
def unichr(code_point)
|
193
187
|
[code_point].pack("U")
|
194
188
|
end
|
195
|
-
|
196
189
|
end
|
197
190
|
end
|
198
191
|
end
|