marc 1.0.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +106 -29
- data/Gemfile +15 -0
- data/README.md +240 -47
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +119 -121
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -81
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +99 -87
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +94 -9
@@ -1,9 +1,7 @@
|
|
1
|
-
|
1
|
+
require "test/unit"
|
2
|
+
require "marc"
|
2
3
|
|
3
|
-
require
|
4
|
-
require 'marc'
|
5
|
-
|
6
|
-
require 'stringio'
|
4
|
+
require "stringio"
|
7
5
|
|
8
6
|
# Testing char encodings under 1.9, don't bother running
|
9
7
|
# these tests except under 1.9, will either fail (because
|
@@ -11,46 +9,44 @@ require 'stringio'
|
|
11
9
|
# (becuase the func they are testing is no-op on 1.9).
|
12
10
|
|
13
11
|
if "".respond_to?(:encoding)
|
14
|
-
|
12
|
+
|
15
13
|
class ReaderCharEncodingsTest < Test::Unit::TestCase
|
16
14
|
####
|
17
15
|
# Helper methods for our tests
|
18
16
|
#
|
19
17
|
####
|
20
|
-
|
21
|
-
|
22
|
-
@@utf_marc_path = 'test/utf8.marc'
|
18
|
+
|
19
|
+
@@utf_marc_path = "test/utf8.marc"
|
23
20
|
# tests against record at test/utf8.marc
|
24
21
|
def assert_utf8_right_in_utf8(record)
|
25
|
-
assert_equal "UTF-8", record[
|
26
|
-
|
27
|
-
assert_equal "UTF-8", record[
|
28
|
-
|
29
|
-
assert_equal "UTF-8", record[
|
30
|
-
assert_equal "UTF-8", record[
|
31
|
-
|
32
|
-
assert_equal "UTF-8", record[
|
33
|
-
assert record[
|
22
|
+
assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
|
23
|
+
|
24
|
+
assert_equal "UTF-8", record["245"].to_s.encoding.name
|
25
|
+
|
26
|
+
assert_equal "UTF-8", record["245"].subfields.first.to_s.encoding.name
|
27
|
+
assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
|
28
|
+
|
29
|
+
assert_equal "UTF-8", record["245"]["a"].encoding.name
|
30
|
+
assert record["245"]["a"].start_with?("Photčhanānukrom")
|
34
31
|
end
|
35
|
-
|
36
|
-
# Test against multirecord just to be sure that works.
|
32
|
+
|
33
|
+
# Test against multirecord just to be sure that works.
|
37
34
|
# the multirecord file is just two concatenated copies
|
38
|
-
# of the single one.
|
35
|
+
# of the single one.
|
39
36
|
@@cp866_marc_path = "test/cp866_multirecord.marc"
|
40
37
|
# assumes record in test/cp866_unimarc.marc
|
41
38
|
# Pass in an encoding name, using ruby's canonical name!
|
42
|
-
# "IBM866" not "cp866". "UTF-8".
|
39
|
+
# "IBM866" not "cp866". "UTF-8".
|
43
40
|
def assert_cp866_right(record, encoding = "IBM866")
|
44
|
-
assert_equal(encoding, record[
|
45
|
-
assert_equal(["d09d"], record[
|
41
|
+
assert_equal(encoding, record["001"].value.encoding.name)
|
42
|
+
assert_equal(["d09d"], record["001"].value.encode("UTF-8").unpack("H4")) # russian capital N
|
46
43
|
end
|
47
44
|
|
48
45
|
@@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
|
49
|
-
|
50
46
|
|
51
|
-
def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
|
47
|
+
def assert_all_values_valid_encoding(record, encoding_name = "UTF-8")
|
52
48
|
record.fields.each do |field|
|
53
|
-
if field.
|
49
|
+
if field.is_a? MARC::DataField
|
54
50
|
field.subfields.each do |sf|
|
55
51
|
assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
|
56
52
|
assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
|
@@ -65,148 +61,140 @@ if "".respond_to?(:encoding)
|
|
65
61
|
####
|
66
62
|
# end helper methods
|
67
63
|
####
|
68
|
-
|
69
|
-
|
64
|
+
|
70
65
|
def test_unicode_load
|
71
66
|
reader = MARC::Reader.new(@@utf_marc_path)
|
72
|
-
|
67
|
+
|
73
68
|
record = nil
|
74
|
-
|
69
|
+
|
75
70
|
assert_nothing_raised { record = reader.first }
|
76
|
-
|
71
|
+
|
77
72
|
assert_utf8_right_in_utf8(record)
|
78
73
|
end
|
79
|
-
|
80
|
-
|
74
|
+
|
81
75
|
def test_unicode_decode_forgiving
|
82
76
|
# two kinds of forgiving invocation, they shouldn't be different,
|
83
77
|
# but just in case they have slightly different code paths, test em
|
84
|
-
# too.
|
85
|
-
marc_string = File.
|
86
|
-
record = MARC::Reader.decode(marc_string, :
|
78
|
+
# too.
|
79
|
+
marc_string = File.read(@@utf_marc_path).force_encoding("utf-8")
|
80
|
+
record = MARC::Reader.decode(marc_string, forgiving: true)
|
87
81
|
assert_utf8_right_in_utf8(record)
|
88
82
|
|
89
|
-
|
90
83
|
reader = MARC::ForgivingReader.new(@@utf_marc_path)
|
91
84
|
record = reader.first
|
92
85
|
assert_utf8_right_in_utf8(record)
|
93
86
|
end
|
94
|
-
|
87
|
+
|
95
88
|
def test_unicode_forgiving_reader_passes_options
|
96
89
|
# Make sure ForgivingReader accepts same options as MARC::Reader
|
97
90
|
# We don't test them ALL though, just a sample.
|
98
|
-
# Tell it we're reading cp866, but trancode to utf8 for us.
|
99
|
-
reader = MARC::ForgivingReader.new(@@cp866_marc_path, :
|
91
|
+
# Tell it we're reading cp866, but trancode to utf8 for us.
|
92
|
+
reader = MARC::ForgivingReader.new(@@cp866_marc_path, external_encoding: "cp866", internal_encoding: "utf-8")
|
100
93
|
|
101
|
-
record = reader.first
|
94
|
+
record = reader.first
|
102
95
|
|
103
96
|
assert_cp866_right(record, "UTF-8")
|
104
97
|
end
|
105
|
-
|
98
|
+
|
106
99
|
def test_explicit_encoding
|
107
|
-
reader = MARC::Reader.new(@@cp866_marc_path, :
|
108
|
-
|
100
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
|
101
|
+
|
109
102
|
assert_cp866_right(reader.first, "IBM866")
|
110
103
|
end
|
111
|
-
|
104
|
+
|
112
105
|
def test_bad_encoding_name_input
|
113
|
-
reader = MARC::Reader.new(@@cp866_marc_path, :
|
106
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "adadfadf")
|
114
107
|
assert_raises ArgumentError do
|
115
108
|
reader.first
|
116
109
|
end
|
117
110
|
end
|
118
|
-
|
111
|
+
|
119
112
|
def test_marc8_with_binary
|
120
|
-
# Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
|
121
|
-
reader = MARC::Reader.new(
|
113
|
+
# Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
|
114
|
+
reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "binary")
|
122
115
|
record = reader.first
|
123
|
-
|
124
|
-
assert_equal "ASCII-8BIT", record[
|
116
|
+
|
117
|
+
assert_equal "ASCII-8BIT", record["100"].subfields.first.value.encoding.name
|
125
118
|
end
|
126
119
|
|
127
120
|
def test_marc8_converted_to_unicode
|
128
|
-
reader = MARC::Reader.new(
|
121
|
+
reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "MARC-8")
|
129
122
|
record = reader.first
|
130
123
|
|
131
124
|
assert_all_values_valid_encoding(record)
|
132
125
|
|
133
|
-
assert_equal "Serreau, Geneviève.", record[
|
126
|
+
assert_equal "Serreau, Geneviève.", record["100"]["a"]
|
134
127
|
end
|
135
128
|
|
136
129
|
def test_marc8_converted_to_unicode_with_file_handle
|
137
130
|
# had some trouble with this one, let's ensure it with a test
|
138
|
-
file
|
139
|
-
reader
|
140
|
-
record
|
131
|
+
file = File.new("test/marc8_accented_chars.marc")
|
132
|
+
reader = MARC::Reader.new(file, external_encoding: "MARC-8")
|
133
|
+
record = reader.first
|
141
134
|
|
142
135
|
assert_all_values_valid_encoding(record)
|
143
136
|
end
|
144
137
|
|
145
138
|
def test_marc8_with_char_entity
|
146
|
-
reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :
|
139
|
+
reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", external_encoding: "MARC-8")
|
147
140
|
record = reader.first
|
148
141
|
|
149
142
|
assert_all_values_valid_encoding(record)
|
150
143
|
|
151
|
-
assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record[
|
144
|
+
assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record["260"]["a"]
|
152
145
|
end
|
153
146
|
|
154
147
|
def test_bad_marc8_raises
|
155
148
|
assert_raise(Encoding::InvalidByteSequenceError) do
|
156
|
-
reader = MARC::Reader.new(@@bad_marc8_path, :
|
157
|
-
|
149
|
+
reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8")
|
150
|
+
reader.first
|
158
151
|
end
|
159
152
|
end
|
160
153
|
|
161
154
|
def test_bad_marc8_with_replacement
|
162
|
-
reader = MARC::Reader.new(@@bad_marc8_path, :
|
155
|
+
reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8", invalid: :replace, replace: "[?]")
|
163
156
|
record = reader.first
|
164
157
|
|
165
|
-
assert_all_values_valid_encoding(record)
|
166
|
-
|
167
|
-
assert record['880']['a'].include?("[?]"), "includes specified replacement string"
|
168
|
-
end
|
158
|
+
assert_all_values_valid_encoding(record)
|
169
159
|
|
160
|
+
assert record["880"]["a"].include?("[?]"), "includes specified replacement string"
|
161
|
+
end
|
170
162
|
|
171
163
|
def test_load_file_opened_with_external_encoding
|
172
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path,
|
173
|
-
|
174
|
-
record = reader.first
|
164
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
|
165
|
+
|
166
|
+
record = reader.first
|
175
167
|
# Make sure it's got the encoding it's supposed to.
|
176
|
-
|
177
|
-
assert_cp866_right(record, "IBM866")
|
168
|
+
|
169
|
+
assert_cp866_right(record, "IBM866")
|
178
170
|
end
|
179
|
-
|
171
|
+
|
180
172
|
def test_explicit_encoding_beats_file_encoding
|
181
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path,
|
182
|
-
|
173
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:utf-8"), external_encoding: "cp866")
|
174
|
+
|
183
175
|
record = reader.first
|
184
|
-
|
185
|
-
assert_cp866_right(record, "IBM866")
|
176
|
+
|
177
|
+
assert_cp866_right(record, "IBM866")
|
186
178
|
end
|
187
|
-
|
179
|
+
|
188
180
|
def test_from_string_with_utf8_encoding
|
189
181
|
marc_file = File.open(@@utf_marc_path)
|
190
|
-
|
191
|
-
reader = MARC::Reader.new(marc_file)
|
192
|
-
record = reader.first
|
193
|
-
|
194
|
-
|
195
|
-
|
196
182
|
|
183
|
+
reader = MARC::Reader.new(marc_file)
|
184
|
+
reader.first
|
197
185
|
end
|
198
186
|
|
199
187
|
# Something that was failing in my client Blacklight code,
|
200
188
|
# bad bytes should be handled appropriately
|
201
189
|
def test_from_string_utf8_with_bad_byte
|
202
|
-
marc_file = File.open(
|
203
|
-
|
204
|
-
reader = MARC::Reader.new(marc_file, :
|
190
|
+
marc_file = File.open("test/marc_with_bad_utf8.utf8.marc")
|
191
|
+
|
192
|
+
reader = MARC::Reader.new(marc_file, invalid: :replace)
|
205
193
|
|
206
194
|
record = reader.first
|
207
195
|
|
208
196
|
record.fields.each do |field|
|
209
|
-
if field.
|
197
|
+
if field.is_a? MARC::ControlField
|
210
198
|
assert_equal "UTF-8", field.value.encoding.name
|
211
199
|
assert field.value.valid_encoding?
|
212
200
|
else
|
@@ -217,129 +205,121 @@ if "".respond_to?(:encoding)
|
|
217
205
|
end
|
218
206
|
end
|
219
207
|
|
220
|
-
assert record[
|
208
|
+
assert record["520"]["a"].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
|
221
209
|
end
|
222
|
-
|
210
|
+
|
223
211
|
def test_from_string_with_cp866
|
224
|
-
marc_string = File.
|
225
|
-
|
212
|
+
marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
|
213
|
+
|
226
214
|
reader = MARC::Reader.new(StringIO.new(marc_string))
|
227
215
|
record = reader.first
|
228
|
-
|
229
|
-
assert_cp866_right(record, "IBM866")
|
216
|
+
|
217
|
+
assert_cp866_right(record, "IBM866")
|
230
218
|
end
|
231
|
-
|
219
|
+
|
232
220
|
def test_decode_from_string_with_cp866
|
233
|
-
marc_string = File.
|
234
|
-
|
221
|
+
marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
|
222
|
+
|
235
223
|
record = MARC::Reader.decode(marc_string)
|
236
|
-
|
237
|
-
assert_cp866_right(record, "IBM866")
|
224
|
+
|
225
|
+
assert_cp866_right(record, "IBM866")
|
238
226
|
end
|
239
|
-
|
227
|
+
|
240
228
|
def test_with_transcode
|
241
|
-
reader = MARC::Reader.new(@@cp866_marc_path,
|
242
|
-
:
|
243
|
-
:
|
244
|
-
|
245
|
-
record = reader.first
|
246
|
-
|
247
|
-
assert_cp866_right(record, "UTF-8")
|
248
|
-
|
229
|
+
reader = MARC::Reader.new(@@cp866_marc_path,
|
230
|
+
external_encoding: "cp866",
|
231
|
+
internal_encoding: "UTF-8")
|
232
|
+
|
233
|
+
record = reader.first
|
234
|
+
|
235
|
+
assert_cp866_right(record, "UTF-8")
|
249
236
|
end
|
250
|
-
|
237
|
+
|
251
238
|
def test_with_binary_filehandle
|
252
239
|
# about to recommend this as a foolproof way to avoid
|
253
240
|
# ruby transcoding behind your back in docs, let's make
|
254
|
-
# sure it really works.
|
255
|
-
reader = MARC::Reader.new(File.open(@@cp866_marc_path, :
|
256
|
-
:
|
257
|
-
|
241
|
+
# sure it really works.
|
242
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, external_encoding: "binary", internal_encoding: "binary"),
|
243
|
+
external_encoding: "IBM866")
|
244
|
+
|
258
245
|
record = reader.first
|
259
246
|
assert_cp866_right(record, "IBM866")
|
260
247
|
end
|
261
|
-
|
248
|
+
|
262
249
|
def test_with_bad_source_bytes
|
263
|
-
reader = MARC::Reader.new(
|
264
|
-
:
|
265
|
-
:
|
266
|
-
|
250
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
251
|
+
external_encoding: "UTF-8",
|
252
|
+
validate_encoding: true)
|
253
|
+
|
267
254
|
assert_raise Encoding::InvalidByteSequenceError do
|
268
|
-
|
255
|
+
reader.first
|
269
256
|
end
|
270
257
|
end
|
271
|
-
|
258
|
+
|
272
259
|
def test_bad_source_bytes_with_replace
|
273
|
-
reader = MARC::Reader.new(
|
274
|
-
:
|
275
|
-
|
260
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
261
|
+
external_encoding: "UTF-8", invalid: :replace)
|
262
|
+
|
276
263
|
record = nil
|
277
264
|
assert_nothing_raised do
|
278
265
|
record = reader.first
|
279
266
|
end
|
280
|
-
|
267
|
+
|
281
268
|
# it should have the unicode replacement char where the bad
|
282
|
-
# byte was.
|
283
|
-
assert_match
|
269
|
+
# byte was.
|
270
|
+
assert_match "=> " + "\uFFFD" + "( <=", record["245"]["a"]
|
284
271
|
end
|
285
|
-
|
272
|
+
|
286
273
|
def test_bad_source_bytes_with_custom_replace
|
287
|
-
reader = MARC::Reader.new(
|
288
|
-
:
|
289
|
-
|
274
|
+
reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
|
275
|
+
external_encoding: "UTF-8", invalid: :replace, replace: "")
|
276
|
+
|
290
277
|
record = reader.first
|
291
|
-
|
292
|
-
# bad byte replaced with empty string, gone.
|
293
|
-
assert_match
|
294
|
-
|
278
|
+
|
279
|
+
# bad byte replaced with empty string, gone.
|
280
|
+
assert_match "=> ( <=", record["245"]["a"]
|
295
281
|
end
|
296
|
-
|
297
|
-
def test_default_internal_encoding
|
282
|
+
|
283
|
+
def test_default_internal_encoding
|
298
284
|
# Some people WILL be changing their Encoding.default_internal
|
299
|
-
# It's even recommended by wycats
|
285
|
+
# It's even recommended by wycats
|
300
286
|
# http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
|
301
287
|
# This will in some cases make ruby File object trans-code
|
302
288
|
# by default. Trans-coding a serial marc binary can change the
|
303
|
-
# byte count and mess it up.
|
289
|
+
# byte count and mess it up.
|
304
290
|
#
|
305
291
|
# But at present, because of the way the Reader is implemented reading
|
306
292
|
# specific bytecounts, it _works_, although it does not _respect_
|
307
293
|
# Encoding.default_internal. That's the best we can do right now,
|
308
|
-
# thsi test is important to ensure it stays at least this good.
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
end
|
294
|
+
# thsi test is important to ensure it stays at least this good.
|
295
|
+
|
296
|
+
original = Encoding.default_internal
|
297
|
+
Encoding.default_internal = "UTF-8"
|
298
|
+
|
299
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
|
300
|
+
|
301
|
+
record = reader.first
|
302
|
+
|
303
|
+
assert_cp866_right(record, "IBM866")
|
304
|
+
ensure
|
305
|
+
Encoding.default_internal = original
|
321
306
|
end
|
322
|
-
|
307
|
+
|
323
308
|
def test_default_internal_encoding_with_string_arg
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
Encoding.default_internal = original
|
335
|
-
end
|
309
|
+
original = Encoding.default_internal
|
310
|
+
Encoding.default_internal = "UTF-8"
|
311
|
+
|
312
|
+
reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
|
313
|
+
|
314
|
+
record = reader.first
|
315
|
+
|
316
|
+
assert_cp866_right(record, "IBM866")
|
317
|
+
ensure
|
318
|
+
Encoding.default_internal = original
|
336
319
|
end
|
337
|
-
|
338
320
|
end
|
339
|
-
|
340
|
-
|
341
|
-
|
321
|
+
|
342
322
|
else
|
343
|
-
require
|
344
|
-
|
323
|
+
require "pathname"
|
324
|
+
warn "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
|
345
325
|
end
|