marc 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +102 -30
- data/Gemfile +15 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +117 -123
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +80 -9
@@ -1,8 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
4
|
-
require 'marc/marc8/map_to_unicode'
|
5
|
-
require 'unf/normalizer'
|
1
|
+
require "marc"
|
2
|
+
require "marc/marc8/map_to_unicode"
|
3
|
+
require "unf/normalizer"
|
6
4
|
|
7
5
|
module MARC
|
8
6
|
module Marc8
|
@@ -24,8 +22,8 @@ module MARC
|
|
24
22
|
BASIC_LATIN = 0x42
|
25
23
|
ANSEL = 0x45
|
26
24
|
|
27
|
-
G0_SET = [
|
28
|
-
G1_SET = [
|
25
|
+
G0_SET = ["(", ",", "$"]
|
26
|
+
G1_SET = [")", "-", "$"]
|
29
27
|
|
30
28
|
CODESETS = MARC::Marc8::MapToUnicode::CODESETS
|
31
29
|
|
@@ -63,10 +61,9 @@ module MARC
|
|
63
61
|
# it's not already, if it's Marc8 there's no good reason for it not to
|
64
62
|
# be already.
|
65
63
|
def transcode(marc8_string, options = {})
|
66
|
-
invalid_replacement
|
67
|
-
expand_ncr
|
68
|
-
normalization
|
69
|
-
|
64
|
+
invalid_replacement = options.fetch(:replace, "\uFFFD")
|
65
|
+
expand_ncr = options.fetch(:expand_ncr, true)
|
66
|
+
normalization = options.fetch(:normalization, :nfc)
|
70
67
|
|
71
68
|
# don't choke on empty marc8_string
|
72
69
|
return "" if marc8_string.nil? || marc8_string.empty?
|
@@ -82,91 +79,89 @@ module MARC
|
|
82
79
|
combinings = []
|
83
80
|
pos = 0
|
84
81
|
while pos < marc8_string.length
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
-
self.g0 = marc8_string[pos+2].ord
|
93
|
-
pos = pos + 3
|
94
|
-
next
|
95
|
-
else
|
96
|
-
# if there aren't enough remaining characters, readd
|
97
|
-
# the escape character so it doesn't get lost; may
|
98
|
-
# help users diagnose problem records
|
99
|
-
uni_list.push marc8_string[pos]
|
100
|
-
pos += 1
|
101
|
-
next
|
102
|
-
end
|
103
|
-
|
104
|
-
elsif G1_SET.include? next_byte
|
105
|
-
if marc8_string[pos+2] == '-' and next_byte == '$'
|
106
|
-
pos += 1
|
107
|
-
end
|
108
|
-
self.g1 = marc8_string[pos+2].ord
|
109
|
-
pos = pos + 3
|
110
|
-
next
|
111
|
-
else
|
112
|
-
charset = next_byte.ord
|
113
|
-
if CODESETS.has_key? charset
|
114
|
-
self.g0 = charset
|
115
|
-
pos += 2
|
116
|
-
elsif charset == 0x73
|
117
|
-
self.g0 = BASIC_LATIN
|
118
|
-
pos += 2
|
119
|
-
if pos == marc8_string.length
|
120
|
-
break
|
121
|
-
end
|
122
|
-
end
|
82
|
+
if marc8_string[pos] == "\x1b"
|
83
|
+
next_byte = marc8_string[pos + 1]
|
84
|
+
if G0_SET.include? next_byte
|
85
|
+
if marc8_string.length >= pos + 3
|
86
|
+
if (marc8_string[pos + 2] == ",") && (next_byte == "$")
|
87
|
+
pos += 1
|
123
88
|
end
|
124
|
-
|
125
|
-
|
126
|
-
mb_flag = is_multibyte(self.g0)
|
127
|
-
|
128
|
-
if mb_flag
|
129
|
-
code_point = (marc8_string[pos].ord * 65536 +
|
130
|
-
marc8_string[pos+1].ord * 256 +
|
131
|
-
marc8_string[pos+2].ord)
|
89
|
+
self.g0 = marc8_string[pos + 2].ord
|
132
90
|
pos += 3
|
133
|
-
|
134
|
-
|
91
|
+
else
|
92
|
+
# if there aren't enough remaining characters, readd
|
93
|
+
# the escape character so it doesn't get lost; may
|
94
|
+
# help users diagnose problem records
|
95
|
+
uni_list.push marc8_string[pos]
|
135
96
|
pos += 1
|
97
|
+
end
|
98
|
+
next
|
99
|
+
elsif G1_SET.include? next_byte
|
100
|
+
if (marc8_string[pos + 2] == "-") && (next_byte == "$")
|
101
|
+
pos += 1
|
102
|
+
end
|
103
|
+
self.g1 = marc8_string[pos + 2].ord
|
104
|
+
pos += 3
|
105
|
+
next
|
106
|
+
else
|
107
|
+
charset = next_byte.ord
|
108
|
+
if CODESETS.has_key? charset
|
109
|
+
self.g0 = charset
|
110
|
+
pos += 2
|
111
|
+
elsif charset == 0x73
|
112
|
+
self.g0 = BASIC_LATIN
|
113
|
+
pos += 2
|
114
|
+
if pos == marc8_string.length
|
115
|
+
break
|
116
|
+
end
|
117
|
+
end
|
136
118
|
end
|
119
|
+
end
|
137
120
|
|
138
|
-
|
139
|
-
(code_point > 0x80 and code_point < 0xa0))
|
140
|
-
uni = unichr(code_point)
|
141
|
-
next
|
142
|
-
end
|
121
|
+
mb_flag = is_multibyte(g0)
|
143
122
|
|
144
|
-
|
145
|
-
|
146
|
-
|
123
|
+
if mb_flag
|
124
|
+
code_point = (marc8_string[pos].ord * 65536 +
|
125
|
+
marc8_string[pos + 1].ord * 256 +
|
126
|
+
marc8_string[pos + 2].ord)
|
127
|
+
pos += 3
|
128
|
+
else
|
129
|
+
code_point = marc8_string[pos].ord
|
130
|
+
pos += 1
|
131
|
+
end
|
147
132
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
133
|
+
if (code_point < 0x20) ||
|
134
|
+
((code_point > 0x80) && (code_point < 0xa0))
|
135
|
+
uni = unichr(code_point)
|
136
|
+
next
|
137
|
+
end
|
138
|
+
|
139
|
+
begin
|
140
|
+
code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
|
141
|
+
(uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
|
142
|
+
|
143
|
+
if cflag
|
144
|
+
combinings.push unichr(uni)
|
145
|
+
else
|
146
|
+
uni_list.push unichr(uni)
|
147
|
+
if combinings.length > 0
|
148
|
+
uni_list.concat combinings
|
149
|
+
combinings = []
|
164
150
|
end
|
165
151
|
end
|
152
|
+
rescue KeyError
|
153
|
+
if options[:invalid] == :replace
|
154
|
+
# Let's coalesce multiple replacements
|
155
|
+
uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
|
156
|
+
pos += 1
|
157
|
+
else
|
158
|
+
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
|
159
|
+
end
|
160
|
+
end
|
166
161
|
end
|
167
162
|
|
168
163
|
# what to do if combining chars left over?
|
169
|
-
uni_str = uni_list.join(
|
164
|
+
uni_str = uni_list.join("")
|
170
165
|
|
171
166
|
if expand_ncr
|
172
167
|
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
@@ -178,7 +173,7 @@ module MARC
|
|
178
173
|
uni_str = UNF::Normalizer.normalize(uni_str, normalization)
|
179
174
|
end
|
180
175
|
|
181
|
-
|
176
|
+
uni_str
|
182
177
|
end
|
183
178
|
|
184
179
|
# from the original python, yeah, apparently
|
@@ -192,7 +187,6 @@ module MARC
|
|
192
187
|
def unichr(code_point)
|
193
188
|
[code_point].pack("U")
|
194
189
|
end
|
195
|
-
|
196
190
|
end
|
197
191
|
end
|
198
192
|
end
|