marc 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +102 -30
- data/Gemfile +15 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +117 -123
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +80 -9
@@ -1,9 +1,7 @@
|
|
1
|
-
|
1
|
+
require "test/unit"
|
2
|
+
require "marc"
|
2
3
|
|
3
|
-
require
|
4
|
-
require 'marc'
|
5
|
-
|
6
|
-
require 'stringio'
|
4
|
+
require "stringio"
|
7
5
|
|
8
6
|
# Testing char encodings under 1.9, don't bother running
|
9
7
|
# these tests except under 1.9, will either fail (because
|
@@ -11,46 +9,44 @@ require 'stringio'
|
|
11
9
|
# (becuase the func they are testing is no-op on 1.9).
|
12
10
|
|
13
11
|
if "".respond_to?(:encoding)
|
14
|
-
|
12
|
+
|
15
13
|
class ReaderCharEncodingsTest < Test::Unit::TestCase
|
16
14
|
####
|
17
15
|
# Helper methods for our tests
|
18
16
|
#
|
19
17
|
####
|
20
|
-
|
21
|
-
|
22
|
-
@@utf_marc_path = 'test/utf8.marc'
|
18
|
+
|
19
|
+
@@utf_marc_path = "test/utf8.marc"
|
23
20
|
# tests against record at test/utf8.marc
|
24
21
|
def assert_utf8_right_in_utf8(record)
|
25
|
-
assert_equal "UTF-8", record[
|
26
|
-
|
27
|
-
assert_equal "UTF-8", record[
|
28
|
-
|
29
|
-
assert_equal "UTF-8", record[
|
30
|
-
assert_equal "UTF-8", record[
|
31
|
-
|
32
|
-
assert_equal "UTF-8", record[
|
33
|
-
assert record[
|
22
|
+
assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
|
23
|
+
|
24
|
+
assert_equal "UTF-8", record["245"].to_s.encoding.name
|
25
|
+
|
26
|
+
assert_equal "UTF-8", record["245"].subfields.first.to_s.encoding.name
|
27
|
+
assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
|
28
|
+
|
29
|
+
assert_equal "UTF-8", record["245"]["a"].encoding.name
|
30
|
+
assert record["245"]["a"].start_with?("Photčhanānukrom")
|
34
31
|
end
|
35
|
-
|
36
|
-
# Test against multirecord just to be sure that works.
|
32
|
+
|
33
|
+
# Test against multirecord just to be sure that works.
|
37
34
|
# the multirecord file is just two concatenated copies
|
38
|
-
# of the single one.
|
35
|
+
# of the single one.
|
39
36
|
@@cp866_marc_path = "test/cp866_multirecord.marc"
|
40
37
|
# assumes record in test/cp866_unimarc.marc
|
41
38
|
# Pass in an encoding name, using ruby's canonical name!
|
42
|
-
# "IBM866" not "cp866". "UTF-8".
|
39
|
+
# "IBM866" not "cp866". "UTF-8".
|
43
40
|
def assert_cp866_right(record, encoding = "IBM866")
|
44
|
-
assert_equal(encoding, record[
|
45
|
-
assert_equal(["d09d"], record[
|
41
|
+
assert_equal(encoding, record["001"].value.encoding.name)
|
42
|
+
assert_equal(["d09d"], record["001"].value.encode("UTF-8").unpack("H4")) # russian capital N
|
46
43
|
end
|
47
44
|
|
48
45
|
@@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
|
49
|
-
|
50
46
|
|
51
|
-
def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
|
47
|
+
def assert_all_values_valid_encoding(record, encoding_name = "UTF-8")
|
52
48
|
record.fields.each do |field|
|
53
|
-
if field.
|
49
|
+
if field.is_a? MARC::DataField
|
54
50
|
field.subfields.each do |sf|
|
55
51
|
assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
|
56
52
|
assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
|
@@ -65,148 +61,140 @@ if "".respond_to?(:encoding)
|
|
65
61
|
####
|
66
62
|
# end helper methods
|
67
63
|
####
|
68
|
-
|
69
|
-
|
64
|
+
|
70
65
|
def test_unicode_load
|
71
66
|
reader = MARC::Reader.new(@@utf_marc_path)
|
72
|
-
|
67
|
+
|
73
68
|
record = nil
|
74
|
-
|
69
|
+
|
75
70
|
assert_nothing_raised { record = reader.first }
|
76
|
-
|
71
|
+
|
77
72
|
assert_utf8_right_in_utf8(record)
|
78
73
|
end
|
79
|
-
|
80
|
-
|
74
|
+
|
81
75
|
def test_unicode_decode_forgiving
|
82
76
|
# two kinds of forgiving invocation, they shouldn't be different,
|
83
77
|
# but just in case they have slightly different code paths, test em
|
84
|
-
# too.
|
85
|
-
marc_string = File.
|
86
|
-
record = MARC::Reader.decode(marc_string, :
|
78
|
+
# too.
|
79
|
+
marc_string = File.read(@@utf_marc_path).force_encoding("utf-8")
|
80
|
+
record = MARC::Reader.decode(marc_string, forgiving: true)
|
87
81
|
assert_utf8_right_in_utf8(record)
|
88
82
|
|
89
|
-
|
90
83
|
reader = MARC::ForgivingReader.new(@@utf_marc_path)
|
91
84
|
record = reader.first
|
92
85
|
assert_utf8_right_in_utf8(record)
|
93
86
|
end
|
94
|
-
|
87
|
+
|
95
88
|
def test_unicode_forgiving_reader_passes_options
|
96
89
|
# Make sure ForgivingReader accepts same options as MARC::Reader
|
97
90
|
# We don't test them ALL though, just a sample.
|
98
|
-
# Tell it we're reading cp866, but trancode to utf8 for us.
|
99
|
-
reader = MARC::ForgivingReader.new(@@cp866_marc_path, :
|
91
|
+
# Tell it we're reading cp866, but trancode to utf8 for us.
|
92
|
+
reader = MARC::ForgivingReader.new(@@cp866_marc_path, external_encoding: "cp866", internal_encoding: "utf-8")
|
100
93
|
|
101
|
-
record = reader.first
|
94
|
+
record = reader.first
|
102
95
|
|
103
96
|
assert_cp866_right(record, "UTF-8")
|
104
97
|
end
|
105
|
-
|
98
|
+
|
106
99
|
def test_explicit_encoding
|
107
|
-
reader = MARC::Reader.new(@@cp866_marc_path, :
|
108
|
-
|
100
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
|
101
|
+
|
109
102
|
assert_cp866_right(reader.first, "IBM866")
|
110
103
|
end
|
111
|
-
|
104
|
+
|
112
105
|
def test_bad_encoding_name_input
|
113
|
-
reader = MARC::Reader.new(@@cp866_marc_path, :
|
106
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "adadfadf")
|
114
107
|
assert_raises ArgumentError do
|
115
108
|
reader.first
|
116
109
|
end
|
117
110
|
end
|
118
|
-
|
111
|
+
|
119
112
|
def test_marc8_with_binary
|
120
|
-
# Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
|
121
|
-
reader = MARC::Reader.new(
|
113
|
+
# Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
|
114
|
+
reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "binary")
|
122
115
|
record = reader.first
|
123
|
-
|
124
|
-
assert_equal "ASCII-8BIT", record[
|
116
|
+
|
117
|
+
assert_equal "ASCII-8BIT", record["100"].subfields.first.value.encoding.name
|
125
118
|
end
|
126
119
|
|
127
120
|
def test_marc8_converted_to_unicode
|
128
|
-
reader = MARC::Reader.new(
|
121
|
+
reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "MARC-8")
|
129
122
|
record = reader.first
|
130
123
|
|
131
124
|
assert_all_values_valid_encoding(record)
|
132
125
|
|
133
|
-
assert_equal "Serreau, Geneviève.", record[
|
126
|
+
assert_equal "Serreau, Geneviève.", record["100"]["a"]
|
134
127
|
end
|
135
128
|
|
136
129
|
def test_marc8_converted_to_unicode_with_file_handle
|
137
130
|
# had some trouble with this one, let's ensure it with a test
|
138
|
-
file
|
139
|
-
reader
|
140
|
-
record
|
131
|
+
file = File.new("test/marc8_accented_chars.marc")
|
132
|
+
reader = MARC::Reader.new(file, external_encoding: "MARC-8")
|
133
|
+
record = reader.first
|
141
134
|
|
142
135
|
assert_all_values_valid_encoding(record)
|
143
136
|
end
|
144
137
|
|
145
138
|
def test_marc8_with_char_entity
|
146
|
-
reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :
|
139
|
+
reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", external_encoding: "MARC-8")
|
147
140
|
record = reader.first
|
148
141
|
|
149
142
|
assert_all_values_valid_encoding(record)
|
150
143
|
|
151
|
-
assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record[
|
144
|
+
assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record["260"]["a"]
|
152
145
|
end
|
153
146
|
|
154
147
|
def test_bad_marc8_raises
|
155
148
|
assert_raise(Encoding::InvalidByteSequenceError) do
|
156
|
-
reader = MARC::Reader.new(@@bad_marc8_path, :
|
157
|
-
|
149
|
+
reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8")
|
150
|
+
reader.first
|
158
151
|
end
|
159
152
|
end
|
160
153
|
|
161
154
|
def test_bad_marc8_with_replacement
|
162
|
-
reader = MARC::Reader.new(@@bad_marc8_path, :
|
155
|
+
reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8", invalid: :replace, replace: "[?]")
|
163
156
|
record = reader.first
|
164
157
|
|
165
|
-
assert_all_values_valid_encoding(record)
|
166
|
-
|
167
|
-
assert record['880']['a'].include?("[?]"), "includes specified replacement string"
|
168
|
-
end
|
158
|
+
assert_all_values_valid_encoding(record)
|
169
159
|
|
160
|
+
assert record["880"]["a"].include?("[?]"), "includes specified replacement string"
|
161
|
+
end
|
170
162
|
|
171
163
|
def test_load_file_opened_with_external_encoding
|
172
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path,
|
173
|
-
|
174
|
-
record = reader.first
|
164
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
|
165
|
+
|
166
|
+
record = reader.first
|
175
167
|
# Make sure it's got the encoding it's supposed to.
|
176
|
-
|
177
|
-
assert_cp866_right(record, "IBM866")
|
168
|
+
|
169
|
+
assert_cp866_right(record, "IBM866")
|
178
170
|
end
|
179
|
-
|
171
|
+
|
180
172
|
def test_explicit_encoding_beats_file_encoding
|
181
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path,
|
182
|
-
|
173
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:utf-8"), external_encoding: "cp866")
|
174
|
+
|
183
175
|
record = reader.first
|
184
|
-
|
185
|
-
assert_cp866_right(record, "IBM866")
|
176
|
+
|
177
|
+
assert_cp866_right(record, "IBM866")
|
186
178
|
end
|
187
|
-
|
179
|
+
|
188
180
|
def test_from_string_with_utf8_encoding
|
189
181
|
marc_file = File.open(@@utf_marc_path)
|
190
|
-
|
191
|
-
reader = MARC::Reader.new(marc_file)
|
192
|
-
record = reader.first
|
193
|
-
|
194
|
-
|
195
|
-
|
196
182
|
|
183
|
+
reader = MARC::Reader.new(marc_file)
|
184
|
+
reader.first
|
197
185
|
end
|
198
186
|
|
199
187
|
# Something that was failing in my client Blacklight code,
|
200
188
|
# bad bytes should be handled appropriately
|
201
189
|
def test_from_string_utf8_with_bad_byte
|
202
|
-
marc_file = File.open(
|
203
|
-
|
204
|
-
reader = MARC::Reader.new(marc_file, :
|
190
|
+
marc_file = File.open("test/marc_with_bad_utf8.utf8.marc")
|
191
|
+
|
192
|
+
reader = MARC::Reader.new(marc_file, invalid: :replace)
|
205
193
|
|
206
194
|
record = reader.first
|
207
195
|
|
208
196
|
record.fields.each do |field|
|
209
|
-
if field.
|
197
|
+
if field.is_a? MARC::ControlField
|
210
198
|
assert_equal "UTF-8", field.value.encoding.name
|
211
199
|
assert field.value.valid_encoding?
|
212
200
|
else
|
@@ -217,129 +205,121 @@ if "".respond_to?(:encoding)
|
|
217
205
|
end
|
218
206
|
end
|
219
207
|
|
220
|
-
assert record[
|
208
|
+
assert record["520"]["a"].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
|
221
209
|
end
|
222
|
-
|
210
|
+
|
223
211
|
def test_from_string_with_cp866
|
224
|
-
marc_string = File.
|
225
|
-
|
212
|
+
marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
|
213
|
+
|
226
214
|
reader = MARC::Reader.new(StringIO.new(marc_string))
|
227
215
|
record = reader.first
|
228
|
-
|
229
|
-
assert_cp866_right(record, "IBM866")
|
216
|
+
|
217
|
+
assert_cp866_right(record, "IBM866")
|
230
218
|
end
|
231
|
-
|
219
|
+
|
232
220
|
def test_decode_from_string_with_cp866
|
233
|
-
marc_string = File.
|
234
|
-
|
221
|
+
marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
|
222
|
+
|
235
223
|
record = MARC::Reader.decode(marc_string)
|
236
|
-
|
237
|
-
assert_cp866_right(record, "IBM866")
|
224
|
+
|
225
|
+
assert_cp866_right(record, "IBM866")
|
238
226
|
end
|
239
|
-
|
227
|
+
|
240
228
|
def test_with_transcode
|
241
|
-
reader = MARC::Reader.new(@@cp866_marc_path,
|
242
|
-
:
|
243
|
-
:
|
244
|
-
|
245
|
-
record = reader.first
|
246
|
-
|
247
|
-
assert_cp866_right(record, "UTF-8")
|
248
|
-
|
229
|
+
reader = MARC::Reader.new(@@cp866_marc_path,
|
230
|
+
external_encoding: "cp866",
|
231
|
+
internal_encoding: "UTF-8")
|
232
|
+
|
233
|
+
record = reader.first
|
234
|
+
|
235
|
+
assert_cp866_right(record, "UTF-8")
|
249
236
|
end
|
250
|
-
|
237
|
+
|
251
238
|
def test_with_binary_filehandle
|
252
239
|
# about to recommend this as a foolproof way to avoid
|
253
240
|
# ruby transcoding behind your back in docs, let's make
|
254
|
-
# sure it really works.
|
255
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path, :
|
256
|
-
:
|
257
|
-
|
241
|
+
# sure it really works.
|
242
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, external_encoding: "binary", internal_encoding: "binary"),
|
243
|
+
external_encoding: "IBM866")
|
244
|
+
|
258
245
|
record = reader.first
|
259
246
|
assert_cp866_right(record, "IBM866")
|
260
247
|
end
|
261
|
-
|
248
|
+
|
262
249
|
def test_with_bad_source_bytes
|
263
|
-
reader = MARC::Reader.new(
|
264
|
-
:
|
265
|
-
:
|
266
|
-
|
250
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
251
|
+
external_encoding: "UTF-8",
|
252
|
+
validate_encoding: true)
|
253
|
+
|
267
254
|
assert_raise Encoding::InvalidByteSequenceError do
|
268
|
-
|
255
|
+
reader.first
|
269
256
|
end
|
270
257
|
end
|
271
|
-
|
258
|
+
|
272
259
|
def test_bad_source_bytes_with_replace
|
273
|
-
reader = MARC::Reader.new(
|
274
|
-
:
|
275
|
-
|
260
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
261
|
+
external_encoding: "UTF-8", invalid: :replace)
|
262
|
+
|
276
263
|
record = nil
|
277
264
|
assert_nothing_raised do
|
278
265
|
record = reader.first
|
279
266
|
end
|
280
|
-
|
267
|
+
|
281
268
|
# it should have the unicode replacement char where the bad
|
282
|
-
# byte was.
|
283
|
-
assert_match
|
269
|
+
# byte was.
|
270
|
+
assert_match "=> " + "\uFFFD" + "( <=", record["245"]["a"]
|
284
271
|
end
|
285
|
-
|
272
|
+
|
286
273
|
def test_bad_source_bytes_with_custom_replace
|
287
|
-
reader = MARC::Reader.new(
|
288
|
-
:
|
289
|
-
|
274
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
275
|
+
external_encoding: "UTF-8", invalid: :replace, replace: "")
|
276
|
+
|
290
277
|
record = reader.first
|
291
|
-
|
292
|
-
# bad byte replaced with empty string, gone.
|
293
|
-
assert_match
|
294
|
-
|
278
|
+
|
279
|
+
# bad byte replaced with empty string, gone.
|
280
|
+
assert_match "=> ( <=", record["245"]["a"]
|
295
281
|
end
|
296
|
-
|
297
|
-
def test_default_internal_encoding
|
282
|
+
|
283
|
+
def test_default_internal_encoding
|
298
284
|
# Some people WILL be changing their Encoding.default_internal
|
299
|
-
# It's even recommended by wycats
|
285
|
+
# It's even recommended by wycats
|
300
286
|
# http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
|
301
287
|
# This will in some cases make ruby File object trans-code
|
302
288
|
# by default. Trans-coding a serial marc binary can change the
|
303
|
-
# byte count and mess it up.
|
289
|
+
# byte count and mess it up.
|
304
290
|
#
|
305
291
|
# But at present, because of the way the Reader is implemented reading
|
306
292
|
# specific bytecounts, it _works_, although it does not _respect_
|
307
293
|
# Encoding.default_internal. That's the best we can do right now,
|
308
|
-
# thsi test is important to ensure it stays at least this good.
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
end
|
294
|
+
# thsi test is important to ensure it stays at least this good.
|
295
|
+
|
296
|
+
original = Encoding.default_internal
|
297
|
+
Encoding.default_internal = "UTF-8"
|
298
|
+
|
299
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
|
300
|
+
|
301
|
+
record = reader.first
|
302
|
+
|
303
|
+
assert_cp866_right(record, "IBM866")
|
304
|
+
ensure
|
305
|
+
Encoding.default_internal = original
|
321
306
|
end
|
322
|
-
|
307
|
+
|
323
308
|
def test_default_internal_encoding_with_string_arg
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
Encoding.default_internal = original
|
335
|
-
end
|
309
|
+
original = Encoding.default_internal
|
310
|
+
Encoding.default_internal = "UTF-8"
|
311
|
+
|
312
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
|
313
|
+
|
314
|
+
record = reader.first
|
315
|
+
|
316
|
+
assert_cp866_right(record, "IBM866")
|
317
|
+
ensure
|
318
|
+
Encoding.default_internal = original
|
336
319
|
end
|
337
|
-
|
338
320
|
end
|
339
|
-
|
340
|
-
|
341
|
-
|
321
|
+
|
342
322
|
else
|
343
|
-
require
|
344
|
-
|
323
|
+
require "pathname"
|
324
|
+
warn "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
|
345
325
|
end
|