marc 1.0.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +106 -29
- data/Gemfile +15 -0
- data/README.md +240 -47
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +119 -121
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -81
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +99 -87
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +94 -9
@@ -1,8 +1,6 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require
|
4
|
-
require 'marc/marc8/map_to_unicode'
|
5
|
-
require 'unf/normalizer'
|
1
|
+
require "marc"
|
2
|
+
require "marc/marc8/map_to_unicode"
|
3
|
+
require "unf/normalizer"
|
6
4
|
|
7
5
|
module MARC
|
8
6
|
module Marc8
|
@@ -24,8 +22,8 @@ module MARC
|
|
24
22
|
BASIC_LATIN = 0x42
|
25
23
|
ANSEL = 0x45
|
26
24
|
|
27
|
-
G0_SET = [
|
28
|
-
G1_SET = [
|
25
|
+
G0_SET = ["(", ",", "$"]
|
26
|
+
G1_SET = [")", "-", "$"]
|
29
27
|
|
30
28
|
CODESETS = MARC::Marc8::MapToUnicode::CODESETS
|
31
29
|
|
@@ -63,10 +61,9 @@ module MARC
|
|
63
61
|
# it's not already, if it's Marc8 there's no good reason for it not to
|
64
62
|
# be already.
|
65
63
|
def transcode(marc8_string, options = {})
|
66
|
-
invalid_replacement
|
67
|
-
expand_ncr
|
68
|
-
normalization
|
69
|
-
|
64
|
+
invalid_replacement = options.fetch(:replace, "\uFFFD")
|
65
|
+
expand_ncr = options.fetch(:expand_ncr, true)
|
66
|
+
normalization = options.fetch(:normalization, :nfc)
|
70
67
|
|
71
68
|
# don't choke on empty marc8_string
|
72
69
|
return "" if marc8_string.nil? || marc8_string.empty?
|
@@ -82,91 +79,89 @@ module MARC
|
|
82
79
|
combinings = []
|
83
80
|
pos = 0
|
84
81
|
while pos < marc8_string.length
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
end
|
92
|
-
self.g0 = marc8_string[pos+2].ord
|
93
|
-
pos = pos + 3
|
94
|
-
next
|
95
|
-
else
|
96
|
-
# if there aren't enough remaining characters, readd
|
97
|
-
# the escape character so it doesn't get lost; may
|
98
|
-
# help users diagnose problem records
|
99
|
-
uni_list.push marc8_string[pos]
|
100
|
-
pos += 1
|
101
|
-
next
|
102
|
-
end
|
103
|
-
|
104
|
-
elsif G1_SET.include? next_byte
|
105
|
-
if marc8_string[pos+2] == '-' and next_byte == '$'
|
106
|
-
pos += 1
|
107
|
-
end
|
108
|
-
self.g1 = marc8_string[pos+2].ord
|
109
|
-
pos = pos + 3
|
110
|
-
next
|
111
|
-
else
|
112
|
-
charset = next_byte.ord
|
113
|
-
if CODESETS.has_key? charset
|
114
|
-
self.g0 = charset
|
115
|
-
pos += 2
|
116
|
-
elsif charset == 0x73
|
117
|
-
self.g0 = BASIC_LATIN
|
118
|
-
pos += 2
|
119
|
-
if pos == marc8_string.length
|
120
|
-
break
|
121
|
-
end
|
122
|
-
end
|
82
|
+
if marc8_string[pos] == "\x1b"
|
83
|
+
next_byte = marc8_string[pos + 1]
|
84
|
+
if G0_SET.include? next_byte
|
85
|
+
if marc8_string.length >= pos + 3
|
86
|
+
if (marc8_string[pos + 2] == ",") && (next_byte == "$")
|
87
|
+
pos += 1
|
123
88
|
end
|
124
|
-
|
125
|
-
|
126
|
-
mb_flag = is_multibyte(self.g0)
|
127
|
-
|
128
|
-
if mb_flag
|
129
|
-
code_point = (marc8_string[pos].ord * 65536 +
|
130
|
-
marc8_string[pos+1].ord * 256 +
|
131
|
-
marc8_string[pos+2].ord)
|
89
|
+
self.g0 = marc8_string[pos + 2].ord
|
132
90
|
pos += 3
|
133
|
-
|
134
|
-
|
91
|
+
else
|
92
|
+
# if there aren't enough remaining characters, readd
|
93
|
+
# the escape character so it doesn't get lost; may
|
94
|
+
# help users diagnose problem records
|
95
|
+
uni_list.push marc8_string[pos]
|
135
96
|
pos += 1
|
97
|
+
end
|
98
|
+
next
|
99
|
+
elsif G1_SET.include? next_byte
|
100
|
+
if (marc8_string[pos + 2] == "-") && (next_byte == "$")
|
101
|
+
pos += 1
|
102
|
+
end
|
103
|
+
self.g1 = marc8_string[pos + 2].ord
|
104
|
+
pos += 3
|
105
|
+
next
|
106
|
+
else
|
107
|
+
charset = next_byte.ord
|
108
|
+
if CODESETS.has_key? charset
|
109
|
+
self.g0 = charset
|
110
|
+
pos += 2
|
111
|
+
elsif charset == 0x73
|
112
|
+
self.g0 = BASIC_LATIN
|
113
|
+
pos += 2
|
114
|
+
if pos == marc8_string.length
|
115
|
+
break
|
116
|
+
end
|
117
|
+
end
|
136
118
|
end
|
119
|
+
end
|
137
120
|
|
138
|
-
|
139
|
-
(code_point > 0x80 and code_point < 0xa0))
|
140
|
-
uni = unichr(code_point)
|
141
|
-
next
|
142
|
-
end
|
121
|
+
mb_flag = is_multibyte(g0)
|
143
122
|
|
144
|
-
|
145
|
-
|
146
|
-
|
123
|
+
if mb_flag
|
124
|
+
code_point = (marc8_string[pos].ord * 65536 +
|
125
|
+
marc8_string[pos + 1].ord * 256 +
|
126
|
+
marc8_string[pos + 2].ord)
|
127
|
+
pos += 3
|
128
|
+
else
|
129
|
+
code_point = marc8_string[pos].ord
|
130
|
+
pos += 1
|
131
|
+
end
|
147
132
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
133
|
+
if (code_point < 0x20) ||
|
134
|
+
((code_point > 0x80) && (code_point < 0xa0))
|
135
|
+
uni = unichr(code_point)
|
136
|
+
next
|
137
|
+
end
|
138
|
+
|
139
|
+
begin
|
140
|
+
code_set = (code_point > 0x80) && !mb_flag ? g1 : g0
|
141
|
+
(uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
|
142
|
+
|
143
|
+
if cflag
|
144
|
+
combinings.push unichr(uni)
|
145
|
+
else
|
146
|
+
uni_list.push unichr(uni)
|
147
|
+
if combinings.length > 0
|
148
|
+
uni_list.concat combinings
|
149
|
+
combinings = []
|
164
150
|
end
|
165
151
|
end
|
152
|
+
rescue KeyError
|
153
|
+
if options[:invalid] == :replace
|
154
|
+
# Let's coalesce multiple replacements
|
155
|
+
uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
|
156
|
+
pos += 1
|
157
|
+
else
|
158
|
+
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, invalid: :replace, replace: "�")}")
|
159
|
+
end
|
160
|
+
end
|
166
161
|
end
|
167
162
|
|
168
163
|
# what to do if combining chars left over?
|
169
|
-
uni_str = uni_list.join(
|
164
|
+
uni_str = uni_list.join("")
|
170
165
|
|
171
166
|
if expand_ncr
|
172
167
|
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
@@ -178,7 +173,7 @@ module MARC
|
|
178
173
|
uni_str = UNF::Normalizer.normalize(uni_str, normalization)
|
179
174
|
end
|
180
175
|
|
181
|
-
|
176
|
+
uni_str
|
182
177
|
end
|
183
178
|
|
184
179
|
# from the original python, yeah, apparently
|
@@ -192,7 +187,6 @@ module MARC
|
|
192
187
|
def unichr(code_point)
|
193
188
|
[code_point].pack("U")
|
194
189
|
end
|
195
|
-
|
196
190
|
end
|
197
191
|
end
|
198
192
|
end
|