marc 1.0.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +106 -29
  7. data/Gemfile +15 -0
  8. data/README.md +240 -47
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -86
  22. data/lib/marc/reader.rb +119 -121
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -81
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +23 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -32
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +99 -87
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +94 -9
@@ -1,9 +1,7 @@
1
- # -*- encoding: utf-8 -*-
1
+ require "test/unit"
2
+ require "marc"
2
3
 
3
- require 'test/unit'
4
- require 'marc'
5
-
6
- require 'stringio'
4
+ require "stringio"
7
5
 
8
6
  # Testing char encodings under 1.9, don't bother running
9
7
  # these tests except under 1.9, will either fail (because
@@ -11,46 +9,44 @@ require 'stringio'
11
9
  # (becuase the func they are testing is no-op on 1.9).
12
10
 
13
11
  if "".respond_to?(:encoding)
14
-
12
+
15
13
  class ReaderCharEncodingsTest < Test::Unit::TestCase
16
14
  ####
17
15
  # Helper methods for our tests
18
16
  #
19
17
  ####
20
-
21
-
22
- @@utf_marc_path = 'test/utf8.marc'
18
+
19
+ @@utf_marc_path = "test/utf8.marc"
23
20
  # tests against record at test/utf8.marc
24
21
  def assert_utf8_right_in_utf8(record)
25
- assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
26
-
27
- assert_equal "UTF-8", record['245'].to_s.encoding.name
28
-
29
- assert_equal "UTF-8", record['245'].subfields.first.to_s.encoding.name
30
- assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
31
-
32
- assert_equal "UTF-8", record['245']['a'].encoding.name
33
- assert record['245']['a'].start_with?("Photčhanānukrom")
22
+ assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
23
+
24
+ assert_equal "UTF-8", record["245"].to_s.encoding.name
25
+
26
+ assert_equal "UTF-8", record["245"].subfields.first.to_s.encoding.name
27
+ assert_equal "UTF-8", record["245"].subfields.first.value.encoding.name
28
+
29
+ assert_equal "UTF-8", record["245"]["a"].encoding.name
30
+ assert record["245"]["a"].start_with?("Photčhanānukrom")
34
31
  end
35
-
36
- # Test against multirecord just to be sure that works.
32
+
33
+ # Test against multirecord just to be sure that works.
37
34
  # the multirecord file is just two concatenated copies
38
- # of the single one.
35
+ # of the single one.
39
36
  @@cp866_marc_path = "test/cp866_multirecord.marc"
40
37
  # assumes record in test/cp866_unimarc.marc
41
38
  # Pass in an encoding name, using ruby's canonical name!
42
- # "IBM866" not "cp866". "UTF-8".
39
+ # "IBM866" not "cp866". "UTF-8".
43
40
  def assert_cp866_right(record, encoding = "IBM866")
44
- assert_equal(encoding, record['001'].value.encoding.name)
45
- assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
41
+ assert_equal(encoding, record["001"].value.encoding.name)
42
+ assert_equal(["d09d"], record["001"].value.encode("UTF-8").unpack("H4")) # russian capital N
46
43
  end
47
44
 
48
45
  @@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
49
-
50
46
 
51
- def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
47
+ def assert_all_values_valid_encoding(record, encoding_name = "UTF-8")
52
48
  record.fields.each do |field|
53
- if field.kind_of? MARC::DataField
49
+ if field.is_a? MARC::DataField
54
50
  field.subfields.each do |sf|
55
51
  assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
56
52
  assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
@@ -65,148 +61,140 @@ if "".respond_to?(:encoding)
65
61
  ####
66
62
  # end helper methods
67
63
  ####
68
-
69
-
64
+
70
65
  def test_unicode_load
71
66
  reader = MARC::Reader.new(@@utf_marc_path)
72
-
67
+
73
68
  record = nil
74
-
69
+
75
70
  assert_nothing_raised { record = reader.first }
76
-
71
+
77
72
  assert_utf8_right_in_utf8(record)
78
73
  end
79
-
80
-
74
+
81
75
  def test_unicode_decode_forgiving
82
76
  # two kinds of forgiving invocation, they shouldn't be different,
83
77
  # but just in case they have slightly different code paths, test em
84
- # too.
85
- marc_string = File.open(@@utf_marc_path).read.force_encoding("utf-8")
86
- record = MARC::Reader.decode(marc_string, :forgiving => true)
78
+ # too.
79
+ marc_string = File.read(@@utf_marc_path).force_encoding("utf-8")
80
+ record = MARC::Reader.decode(marc_string, forgiving: true)
87
81
  assert_utf8_right_in_utf8(record)
88
82
 
89
-
90
83
  reader = MARC::ForgivingReader.new(@@utf_marc_path)
91
84
  record = reader.first
92
85
  assert_utf8_right_in_utf8(record)
93
86
  end
94
-
87
+
95
88
  def test_unicode_forgiving_reader_passes_options
96
89
  # Make sure ForgivingReader accepts same options as MARC::Reader
97
90
  # We don't test them ALL though, just a sample.
98
- # Tell it we're reading cp866, but trancode to utf8 for us.
99
- reader = MARC::ForgivingReader.new(@@cp866_marc_path, :external_encoding => "cp866", :internal_encoding => "utf-8")
91
+ # Tell it we're reading cp866, but trancode to utf8 for us.
92
+ reader = MARC::ForgivingReader.new(@@cp866_marc_path, external_encoding: "cp866", internal_encoding: "utf-8")
100
93
 
101
- record = reader.first
94
+ record = reader.first
102
95
 
103
96
  assert_cp866_right(record, "UTF-8")
104
97
  end
105
-
98
+
106
99
  def test_explicit_encoding
107
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866')
108
-
100
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
101
+
109
102
  assert_cp866_right(reader.first, "IBM866")
110
103
  end
111
-
104
+
112
105
  def test_bad_encoding_name_input
113
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'adadfadf')
106
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "adadfadf")
114
107
  assert_raises ArgumentError do
115
108
  reader.first
116
109
  end
117
110
  end
118
-
111
+
119
112
  def test_marc8_with_binary
120
- # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
121
- reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
113
+ # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
114
+ reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "binary")
122
115
  record = reader.first
123
-
124
- assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
116
+
117
+ assert_equal "ASCII-8BIT", record["100"].subfields.first.value.encoding.name
125
118
  end
126
119
 
127
120
  def test_marc8_converted_to_unicode
128
- reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'MARC-8')
121
+ reader = MARC::Reader.new("test/marc8_accented_chars.marc", external_encoding: "MARC-8")
129
122
  record = reader.first
130
123
 
131
124
  assert_all_values_valid_encoding(record)
132
125
 
133
- assert_equal "Serreau, Geneviève.", record['100']['a']
126
+ assert_equal "Serreau, Geneviève.", record["100"]["a"]
134
127
  end
135
128
 
136
129
  def test_marc8_converted_to_unicode_with_file_handle
137
130
  # had some trouble with this one, let's ensure it with a test
138
- file = File.new('test/marc8_accented_chars.marc')
139
- reader = MARC::Reader.new(file, :external_encoding => "MARC-8")
140
- record = reader.first
131
+ file = File.new("test/marc8_accented_chars.marc")
132
+ reader = MARC::Reader.new(file, external_encoding: "MARC-8")
133
+ record = reader.first
141
134
 
142
135
  assert_all_values_valid_encoding(record)
143
136
  end
144
137
 
145
138
  def test_marc8_with_char_entity
146
- reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :external_encoding => "MARC-8")
139
+ reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", external_encoding: "MARC-8")
147
140
  record = reader.first
148
141
 
149
142
  assert_all_values_valid_encoding(record)
150
143
 
151
- assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
144
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record["260"]["a"]
152
145
  end
153
146
 
154
147
  def test_bad_marc8_raises
155
148
  assert_raise(Encoding::InvalidByteSequenceError) do
156
- reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8')
157
- record = reader.first
149
+ reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8")
150
+ reader.first
158
151
  end
159
152
  end
160
153
 
161
154
  def test_bad_marc8_with_replacement
162
- reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8', :invalid => :replace, :replace => "[?]")
155
+ reader = MARC::Reader.new(@@bad_marc8_path, external_encoding: "MARC-8", invalid: :replace, replace: "[?]")
163
156
  record = reader.first
164
157
 
165
- assert_all_values_valid_encoding(record)
166
-
167
- assert record['880']['a'].include?("[?]"), "includes specified replacement string"
168
- end
158
+ assert_all_values_valid_encoding(record)
169
159
 
160
+ assert record["880"]["a"].include?("[?]"), "includes specified replacement string"
161
+ end
170
162
 
171
163
  def test_load_file_opened_with_external_encoding
172
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
173
-
174
- record = reader.first
164
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
165
+
166
+ record = reader.first
175
167
  # Make sure it's got the encoding it's supposed to.
176
-
177
- assert_cp866_right(record, "IBM866")
168
+
169
+ assert_cp866_right(record, "IBM866")
178
170
  end
179
-
171
+
180
172
  def test_explicit_encoding_beats_file_encoding
181
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:utf-8'), :external_encoding => "cp866")
182
-
173
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:utf-8"), external_encoding: "cp866")
174
+
183
175
  record = reader.first
184
-
185
- assert_cp866_right(record, "IBM866")
176
+
177
+ assert_cp866_right(record, "IBM866")
186
178
  end
187
-
179
+
188
180
  def test_from_string_with_utf8_encoding
189
181
  marc_file = File.open(@@utf_marc_path)
190
-
191
- reader = MARC::Reader.new(marc_file)
192
- record = reader.first
193
-
194
-
195
-
196
182
 
183
+ reader = MARC::Reader.new(marc_file)
184
+ reader.first
197
185
  end
198
186
 
199
187
  # Something that was failing in my client Blacklight code,
200
188
  # bad bytes should be handled appropriately
201
189
  def test_from_string_utf8_with_bad_byte
202
- marc_file = File.open('test/marc_with_bad_utf8.utf8.marc')
203
-
204
- reader = MARC::Reader.new(marc_file, :invalid => :replace)
190
+ marc_file = File.open("test/marc_with_bad_utf8.utf8.marc")
191
+
192
+ reader = MARC::Reader.new(marc_file, invalid: :replace)
205
193
 
206
194
  record = reader.first
207
195
 
208
196
  record.fields.each do |field|
209
- if field.kind_of? MARC::ControlField
197
+ if field.is_a? MARC::ControlField
210
198
  assert_equal "UTF-8", field.value.encoding.name
211
199
  assert field.value.valid_encoding?
212
200
  else
@@ -217,129 +205,121 @@ if "".respond_to?(:encoding)
217
205
  end
218
206
  end
219
207
 
220
- assert record['520']['a'].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
208
+ assert record["520"]["a"].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
221
209
  end
222
-
210
+
223
211
  def test_from_string_with_cp866
224
- marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
225
-
212
+ marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
213
+
226
214
  reader = MARC::Reader.new(StringIO.new(marc_string))
227
215
  record = reader.first
228
-
229
- assert_cp866_right(record, "IBM866")
216
+
217
+ assert_cp866_right(record, "IBM866")
230
218
  end
231
-
219
+
232
220
  def test_decode_from_string_with_cp866
233
- marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
234
-
221
+ marc_string = File.read(@@cp866_marc_path).force_encoding("cp866")
222
+
235
223
  record = MARC::Reader.decode(marc_string)
236
-
237
- assert_cp866_right(record, "IBM866")
224
+
225
+ assert_cp866_right(record, "IBM866")
238
226
  end
239
-
227
+
240
228
  def test_with_transcode
241
- reader = MARC::Reader.new(@@cp866_marc_path,
242
- :external_encoding => 'cp866',
243
- :internal_encoding => 'UTF-8')
244
-
245
- record = reader.first
246
-
247
- assert_cp866_right(record, "UTF-8")
248
-
229
+ reader = MARC::Reader.new(@@cp866_marc_path,
230
+ external_encoding: "cp866",
231
+ internal_encoding: "UTF-8")
232
+
233
+ record = reader.first
234
+
235
+ assert_cp866_right(record, "UTF-8")
249
236
  end
250
-
237
+
251
238
  def test_with_binary_filehandle
252
239
  # about to recommend this as a foolproof way to avoid
253
240
  # ruby transcoding behind your back in docs, let's make
254
- # sure it really works.
255
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, :external_encoding => "binary", :internal_encoding => "binary"),
256
- :external_encoding => "IBM866")
257
-
241
+ # sure it really works.
242
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, external_encoding: "binary", internal_encoding: "binary"),
243
+ external_encoding: "IBM866")
244
+
258
245
  record = reader.first
259
246
  assert_cp866_right(record, "IBM866")
260
247
  end
261
-
248
+
262
249
  def test_with_bad_source_bytes
263
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
264
- :external_encoding => "UTF-8",
265
- :validate_encoding => true)
266
-
250
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
251
+ external_encoding: "UTF-8",
252
+ validate_encoding: true)
253
+
267
254
  assert_raise Encoding::InvalidByteSequenceError do
268
- record = reader.first
255
+ reader.first
269
256
  end
270
257
  end
271
-
258
+
272
259
  def test_bad_source_bytes_with_replace
273
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
274
- :external_encoding => "UTF-8", :invalid => :replace)
275
-
260
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
261
+ external_encoding: "UTF-8", invalid: :replace)
262
+
276
263
  record = nil
277
264
  assert_nothing_raised do
278
265
  record = reader.first
279
266
  end
280
-
267
+
281
268
  # it should have the unicode replacement char where the bad
282
- # byte was.
283
- assert_match '=> ' + "\uFFFD" + '( <=', record['245']['a']
269
+ # byte was.
270
+ assert_match "=> " + "\uFFFD" + "( <=", record["245"]["a"]
284
271
  end
285
-
272
+
286
273
  def test_bad_source_bytes_with_custom_replace
287
- reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
288
- :external_encoding => "UTF-8", :invalid => :replace, :replace => '')
289
-
274
+ reader = MARC::Reader.new("test/utf8_with_bad_bytes.marc",
275
+ external_encoding: "UTF-8", invalid: :replace, replace: "")
276
+
290
277
  record = reader.first
291
-
292
- # bad byte replaced with empty string, gone.
293
- assert_match '=> ( <=', record['245']['a']
294
-
278
+
279
+ # bad byte replaced with empty string, gone.
280
+ assert_match "=> ( <=", record["245"]["a"]
295
281
  end
296
-
297
- def test_default_internal_encoding
282
+
283
+ def test_default_internal_encoding
298
284
  # Some people WILL be changing their Encoding.default_internal
299
- # It's even recommended by wycats
285
+ # It's even recommended by wycats
300
286
  # http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
301
287
  # This will in some cases make ruby File object trans-code
302
288
  # by default. Trans-coding a serial marc binary can change the
303
- # byte count and mess it up.
289
+ # byte count and mess it up.
304
290
  #
305
291
  # But at present, because of the way the Reader is implemented reading
306
292
  # specific bytecounts, it _works_, although it does not _respect_
307
293
  # Encoding.default_internal. That's the best we can do right now,
308
- # thsi test is important to ensure it stays at least this good.
309
- begin
310
- original = Encoding.default_internal
311
- Encoding.default_internal = "UTF-8"
312
-
313
- reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
314
-
315
- record = reader.first
316
-
317
- assert_cp866_right(record, "IBM866")
318
- ensure
319
- Encoding.default_internal = original
320
- end
294
+ # thsi test is important to ensure it stays at least this good.
295
+
296
+ original = Encoding.default_internal
297
+ Encoding.default_internal = "UTF-8"
298
+
299
+ reader = MARC::Reader.new(File.open(@@cp866_marc_path, "r:cp866"))
300
+
301
+ record = reader.first
302
+
303
+ assert_cp866_right(record, "IBM866")
304
+ ensure
305
+ Encoding.default_internal = original
321
306
  end
322
-
307
+
323
308
  def test_default_internal_encoding_with_string_arg
324
- begin
325
- original = Encoding.default_internal
326
- Encoding.default_internal = "UTF-8"
327
-
328
- reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => "cp866")
329
-
330
- record = reader.first
331
-
332
- assert_cp866_right(record, "IBM866")
333
- ensure
334
- Encoding.default_internal = original
335
- end
309
+ original = Encoding.default_internal
310
+ Encoding.default_internal = "UTF-8"
311
+
312
+ reader = MARC::Reader.new(@@cp866_marc_path, external_encoding: "cp866")
313
+
314
+ record = reader.first
315
+
316
+ assert_cp866_right(record, "IBM866")
317
+ ensure
318
+ Encoding.default_internal = original
336
319
  end
337
-
338
320
  end
339
-
340
-
341
-
321
+
342
322
  else
343
- require 'pathname'
344
- $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
323
+ require "pathname"
324
+ warn "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
345
325
  end