marc 1.1.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +116 -30
  7. data/Gemfile +5 -0
  8. data/README.md +239 -46
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -87
  22. data/lib/marc/reader.rb +116 -124
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -82
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +28 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -34
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +101 -94
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +129 -22
@@ -1,9 +1,7 @@
1
- # -*- encoding: utf-8 -*-
1
+ require "test/unit"
2
+ require "marc"
2
3
 
3
- require 'test/unit'
4
- require 'marc'
5
-
6
- require 'stringio'
4
+ require "stringio"
7
5
 
8
6
  # Testing char encodings under 1.9, don't bother running
9
7
  # these tests except under 1.9, will either fail (because
@@ -11,46 +9,44 @@ require 'stringio'
11
9
  # (because the func they are testing is no-op on 1.9).
12
10
 
13
11
  if "".respond_to?(:encoding)
14
-
12
+
15
13
  class ReaderCharEncodingsTest < Test::Unit::TestCase
16
14
  ####
17
15
  # Helper methods for our tests
18
16
  #
19
17
  ####
20
-
21
-
22
- @@utf_marc_path = 'test/utf8.marc'
18
+
19
+ @@utf_marc_path = "test/utf8.marc"
23
20
  # tests against record at test/utf8.marc
24
21
  def assert_utf8_right_in_utf8(record)
25
- assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
26
-
27
- assert_equal "UTF-8", record['245'].to_s.encoding.name
28
-
29
- assert_equal "UTF-8", record['245'].subfields.first.to_s.encoding.name
30
- assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
31
-
32
- assert_equal "UTF-8", record['245']['a'].encoding.name
33
- assert record['245']['a'].start_with?("Photčhanānukrom")
22
+ assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
23
+
24
+ assert_equal "UTF-8", record["245"].to_s.encoding.name
25
+
26
+ assert_equal "UTF-8", record["245"].subfields.first.to_s.encoding.name
27
+ assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
28
+
29
+ assert_equal "UTF-8", record["245"]["a"].encoding.name
30
+ assert record["245"]["a"].start_with?("Photčhanānukrom")
34
31
  end
35
-
36
- # Test against multirecord just to be sure that works.
32
+
33
+ # Test against multirecord just to be sure that works.
37
34
  # the multirecord file is just two concatenated copies
38
- # of the single one.
35
+ # of the single one.
39
36
  @@cp866_marc_path = "test/cp866_multirecord.marc"
40
37
  # assumes record in test/cp866_unimarc.marc
41
38
  # Pass in an encoding name, using ruby's canonical name!
42
- # "IBM866" not "cp866". "UTF-8".
39
+ # "IBM866" not "cp866". "UTF-8".
43
40
  def assert_cp866_right(record, encoding = "IBM866")
44
- assert_equal(encoding, record['001'].value.encoding.name)
45
- assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
41
+ assert_equal(encoding, record["001"].value.encoding.name)
42
+ assert_equal(["d09d"], record["001"].value.encode("UTF-8").unpack("H4")) # russian capital N
46
43
  end
47
44
 
48
45
  @@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
49
-
50
46
 
51
- def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
47
+ def assert_all_values_valid_encoding(record, encoding_name = "UTF-8")
52
48
  record.fields.each do |field|
53
- if field.kind_of? MARC::DataField
49
+ if field.is_a? MARC::DataField
54
50
  field.subfields.each do |sf|
55
51
  assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
56
52
  assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
@@ -65,148 +61,140 @@ if "".respond_to?(:encoding)
65
61
  ####
66
62
  # end helper methods
67
63
  ####
68
-
69
-
64
+
70
65
  def test_unicode_load
71
66
  reader = MARC::Reader.new(@@utf_marc_path)
72
-
67
+
73
68
  record = nil
74
-
69
+
75
70
  assert_nothing_raised { record = reader.first }
76
-
71
+
77
72
  assert_utf8_right_in_utf8(record)
78
73
  end
79
-
80
-
74
+
81
75
  def test_unicode_decode_forgiving
82
76
  # two kinds of forgiving invocation, they shouldn't be different,
83
77
  # but just in case they have slightly different code paths, test em
84
- # too.
85
- marc_string = File.open(@@utf_marc_path).read.force_encoding("utf-8")
86
- record = MARC::Reader.decode(marc_string, :forgiving => true)
78
+ # too.
79
+ marc_string = File.read(@@utf_marc_path).force_encoding("utf-8")
80
+ record = MARC::Reader.decode(marc_string, forgiving: true)
87
81
  assert_utf8_right_in_utf8(record)
88
82
 
89
-
90
83
  reader = MARC::ForgivingReader.new(@@utf_marc_path)
91
84
  record = reader.first
92
85
  assert_utf8_right_in_utf8(record)
93
86
  end
94
-
87
+
95
88
  def test_unicode_forgiving_reader_passes_options
96
89
  # Make sure ForgivingReader accepts same options as MARC::Reader
97
90
  # We don't test them ALL though, just a sample.
98
- # Tell it we're reading cp866, but transcode to utf8 for us.
99
- reader = MARC::ForgivingReader.new(@@cp866_marc_path, :external_encoding => "cp866", :internal_encoding => "utf-8")
91
+ # Tell it we're reading cp866, but transcode to utf8 for us.
92
+ reader = MARC::ForgivingReader.new(@@cp866_marc_path, external_encoding: "cp866", internal_encoding: "utf-8")
100
93
 
101
- record = reader.first
94
+ record = reader.first
102
95
 
103
96
  assert_cp866_right(record, "UTF-8")
104
97
  end
105
-
98
+
106
99
  def test_explicit_encoding
107
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866')
108
-
100
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
101
+
109
102
  assert_cp866_right(reader.first, "IBM866")
110
103
  end
111
-
104
+
112
105
  def test_bad_encoding_name_input
113
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'adadfadf')
106
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "adadfadf")
114
107
  assert_raises ArgumentError do
115
108
  reader.first
116
109
  end
117
110
  end
118
-
111
+
119
112
  def test_marc8_with_binary
120
- # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
121
- reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
113
+ # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
114
+ reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "binary")
122
115
  record = reader.first
123
-
124
- assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
116
+
117
+ assert_equal "ASCII-8BIT", record["100"].subfields.first.value.encoding.name
125
118
  end
126
119
 
127
120
  def test_marc8_converted_to_unicode
128
- reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'MARC-8')
121
+ reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "MARC-8")
129
122
  record = reader.first
130
123
 
131
124
  assert_all_values_valid_encoding(record)
132
125
 
133
- assert_equal "Serreau, Geneviève.", record['100']['a']
126
+ assert_equal "Serreau, Geneviève.", record["100"]["a"]
134
127
  end
135
128
 
136
129
  def test_marc8_converted_to_unicode_with_file_handle
137
130
  # had some trouble with this one, let's ensure it with a test
138
- file = File.new('test/marc8_accented_chars.marc')
139
- reader = MARC::Reader.new(file, :external_encoding => "MARC-8")
140
- record = reader.first
131
+ file = File.new("test/marc8_accented_chars.marc")
132
+ reader = MARC::Reader.new(file, external_encoding: "MARC-8")
133
+ record = reader.first
141
134
 
142
135
  assert_all_values_valid_encoding(record)
143
136
  end
144
137
 
145
138
  def test_marc8_with_char_entity
146
- reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :external_encoding => "MARC-8")
139
+ reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", external_encoding: "MARC-8")
147
140
  record = reader.first
148
141
 
149
142
  assert_all_values_valid_encoding(record)
150
143
 
151
- assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
144
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record["260"]["a"]
152
145
  end
153
146
 
154
147
  def test_bad_marc8_raises
155
148
  assert_raise(Encoding::InvalidByteSequenceError) do
156
- reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8')
157
- record = reader.first
149
+ reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8")
150
+ reader.first
158
151
  end
159
152
  end
160
153
 
161
154
  def test_bad_marc8_with_replacement
162
- reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8', :invalid => :replace, :replace => "[?]")
155
+ reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8", invalid: :replace, replace: "[?]")
163
156
  record = reader.first
164
157
 
165
- assert_all_values_valid_encoding(record)
166
-
167
- assert record['880']['a'].include?("[?]"), "includes specified replacement string"
168
- end
158
+ assert_all_values_valid_encoding(record)
169
159
 
160
+ assert record["880"]["a"].include?("[?]"), "includes specified replacement string"
161
+ end
170
162
 
171
163
  def test_load_file_opened_with_external_encoding
172
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
173
-
174
- record = reader.first
164
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
165
+
166
+ record = reader.first
175
167
  # Make sure it's got the encoding it's supposed to.
176
-
177
- assert_cp866_right(record, "IBM866")
168
+
169
+ assert_cp866_right(record, "IBM866")
178
170
  end
179
-
171
+
180
172
  def test_explicit_encoding_beats_file_encoding
181
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:utf-8'), :external_encoding => "cp866")
182
-
173
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:utf-8"), external_encoding: "cp866")
174
+
183
175
  record = reader.first
184
-
185
- assert_cp866_right(record, "IBM866")
176
+
177
+ assert_cp866_right(record, "IBM866")
186
178
  end
187
-
179
+
188
180
  def test_from_string_with_utf8_encoding
189
181
  marc_file = File.open(@@utf_marc_path)
190
-
191
- reader = MARC::Reader.new(marc_file)
192
- record = reader.first
193
-
194
-
195
-
196
182
 
183
+ reader = MARC::Reader.new(marc_file)
184
+ reader.first
197
185
  end
198
186
 
199
187
  # Something that was failing in my client Blacklight code,
200
188
  # bad bytes should be handled appropriately
201
189
  def test_from_string_utf8_with_bad_byte
202
- marc_file = File.open('test/marc_with_bad_utf8.utf8.marc')
203
-
204
- reader = MARC::Reader.new(marc_file, :invalid => :replace)
190
+ marc_file = File.open("test/marc_with_bad_utf8.utf8.marc")
191
+
192
+ reader = MARC::Reader.new(marc_file, invalid: :replace)
205
193
 
206
194
  record = reader.first
207
195
 
208
196
  record.fields.each do |field|
209
- if field.kind_of? MARC::ControlField
197
+ if field.is_a? MARC::ControlField
210
198
  assert_equal "UTF-8", field.value.encoding.name
211
199
  assert field.value.valid_encoding?
212
200
  else
@@ -217,129 +205,121 @@ if "".respond_to?(:encoding)
217
205
  end
218
206
  end
219
207
 
220
- assert record['520']['a'].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
208
+ assert record["520"]["a"].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
221
209
  end
222
-
210
+
223
211
  def test_from_string_with_cp866
224
- marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
225
-
212
+ marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
213
+
226
214
  reader = MARC::Reader.new(StringIO.new(marc_string))
227
215
  record = reader.first
228
-
229
- assert_cp866_right(record, "IBM866")
216
+
217
+ assert_cp866_right(record, "IBM866")
230
218
  end
231
-
219
+
232
220
  def test_decode_from_string_with_cp866
233
- marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
234
-
221
+ marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
222
+
235
223
  record = MARC::Reader.decode(marc_string)
236
-
237
- assert_cp866_right(record, "IBM866")
224
+
225
+ assert_cp866_right(record, "IBM866")
238
226
  end
239
-
227
+
240
228
  def test_with_transcode
241
- reader = MARC::Reader.new(@@cp866_marc_path,
242
- :external_encoding => 'cp866',
243
- :internal_encoding => 'UTF-8')
244
-
245
- record = reader.first
246
-
247
- assert_cp866_right(record, "UTF-8")
248
-
229
+ reader = MARC::Reader.new(@@cp866_marc_path,
230
+ external_encoding: "cp866",
231
+ internal_encoding: "UTF-8")
232
+
233
+ record = reader.first
234
+
235
+ assert_cp866_right(record, "UTF-8")
249
236
  end
250
-
237
+
251
238
  def test_with_binary_filehandle
252
239
  # about to recommend this as a foolproof way to avoid
253
240
  # ruby transcoding behind your back in docs, let's make
254
- # sure it really works.
255
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, :external_encoding => "binary", :internal_encoding => "binary"),
256
- :external_encoding => "IBM866")
257
-
241
+ # sure it really works.
242
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, external_encoding: "binary", internal_encoding: "binary"),
243
+ external_encoding: "IBM866")
244
+
258
245
  record = reader.first
259
246
  assert_cp866_right(record, "IBM866")
260
247
  end
261
-
248
+
262
249
  def test_with_bad_source_bytes
263
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
264
- :external_encoding => "UTF-8",
265
- :validate_encoding => true)
266
-
250
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
251
+ external_encoding: "UTF-8",
252
+ validate_encoding: true)
253
+
267
254
  assert_raise Encoding::InvalidByteSequenceError do
268
- record = reader.first
255
+ reader.first
269
256
  end
270
257
  end
271
-
258
+
272
259
  def test_bad_source_bytes_with_replace
273
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
274
- :external_encoding => "UTF-8", :invalid => :replace)
275
-
260
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
261
+ external_encoding: "UTF-8", invalid: :replace)
262
+
276
263
  record = nil
277
264
  assert_nothing_raised do
278
265
  record = reader.first
279
266
  end
280
-
267
+
281
268
  # it should have the unicode replacement char where the bad
282
- # byte was.
283
- assert_match '=> ' + "\uFFFD" + '( <=', record['245']['a']
269
+ # byte was.
270
+ assert_match "=> " + "\uFFFD" + "( <=", record["245"]["a"]
284
271
  end
285
-
272
+
286
273
  def test_bad_source_bytes_with_custom_replace
287
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
288
- :external_encoding => "UTF-8", :invalid => :replace, :replace => '')
289
-
274
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
275
+ external_encoding: "UTF-8", invalid: :replace, replace: "")
276
+
290
277
  record = reader.first
291
-
292
- # bad byte replaced with empty string, gone.
293
- assert_match '=> ( <=', record['245']['a']
294
-
278
+
279
+ # bad byte replaced with empty string, gone.
280
+ assert_match "=> ( <=", record["245"]["a"]
295
281
  end
296
-
297
- def test_default_internal_encoding
282
+
283
+ def test_default_internal_encoding
298
284
  # Some people WILL be changing their Encoding.default_internal
299
- # It's even recommended by wycats
285
+ # It's even recommended by wycats
300
286
  # http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
301
287
  # This will in some cases make ruby File object trans-code
302
288
  # by default. Trans-coding a serial marc binary can change the
303
- # byte count and mess it up.
289
+ # byte count and mess it up.
304
290
  #
305
291
  # But at present, because of the way the Reader is implemented reading
306
292
  # specific bytecounts, it _works_, although it does not _respect_
307
293
  # Encoding.default_internal. That's the best we can do right now,
308
- # this test is important to ensure it stays at least this good.
309
- begin
310
- original = Encoding.default_internal
311
- Encoding.default_internal = "UTF-8"
312
-
313
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
314
-
315
- record = reader.first
316
-
317
- assert_cp866_right(record, "IBM866")
318
- ensure
319
- Encoding.default_internal = original
320
- end
294
+ # this test is important to ensure it stays at least this good.
295
+
296
+ original = Encoding.default_internal
297
+ Encoding.default_internal = "UTF-8"
298
+
299
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
300
+
301
+ record = reader.first
302
+
303
+ assert_cp866_right(record, "IBM866")
304
+ ensure
305
+ Encoding.default_internal = original
321
306
  end
322
-
307
+
323
308
  def test_default_internal_encoding_with_string_arg
324
- begin
325
- original = Encoding.default_internal
326
- Encoding.default_internal = "UTF-8"
327
-
328
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => "cp866")
329
-
330
- record = reader.first
331
-
332
- assert_cp866_right(record, "IBM866")
333
- ensure
334
- Encoding.default_internal = original
335
- end
309
+ original = Encoding.default_internal
310
+ Encoding.default_internal = "UTF-8"
311
+
312
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
313
+
314
+ record = reader.first
315
+
316
+ assert_cp866_right(record, "IBM866")
317
+ ensure
318
+ Encoding.default_internal = original
336
319
  end
337
-
338
320
  end
339
-
340
-
341
-
321
+
342
322
  else
343
- require 'pathname'
344
- $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
323
+ require "pathname"
324
+ warn "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
345
325
  end