marc 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +116 -30
- data/Gemfile +5 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -87
- data/lib/marc/reader.rb +116 -124
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +28 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -34
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +129 -22
data/lib/marc/reader.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'scrub_rb'
|
2
|
-
|
3
1
|
# Note: requiring 'marc/marc8/to_unicode' below, in #initialize,
|
4
2
|
# only when necessary
|
5
3
|
|
6
4
|
module MARC
|
7
|
-
# A class for reading MARC binary (ISO 2709) files.
|
5
|
+
# A class for reading MARC binary (ISO 2709) files.
|
8
6
|
#
|
9
7
|
# == Character Encoding
|
10
8
|
#
|
@@ -12,7 +10,7 @@ module MARC
|
|
12
10
|
# If illegal bytes for that character encoding are encountered in certain
|
13
11
|
# operations, ruby will raise an exception. If a String is incorrectly
|
14
12
|
# tagged with the wrong character encoding, that makes it fairly likely
|
15
|
-
# an illegal byte for the specified encoding will be encountered.
|
13
|
+
# an illegal byte for the specified encoding will be encountered.
|
16
14
|
#
|
17
15
|
# So when reading binary MARC data with the MARC::Reader, it's important
|
18
16
|
# that you let it know the expected encoding:
|
@@ -21,7 +19,7 @@ module MARC
|
|
21
19
|
#
|
22
20
|
# If you leave off 'external_encoding', it will use the ruby environment
|
23
21
|
# Encoding.default_external, which is usually UTF-8 but may depend on your
|
24
|
-
# environment.
|
22
|
+
# environment.
|
25
23
|
#
|
26
24
|
# Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
|
27
25
|
# bytes. By default MARC::Reader will leave these in the produced Strings,
|
@@ -29,58 +27,58 @@ module MARC
|
|
29
27
|
# to catch this early, and ask MARC::Reader to raise immediately on illegal
|
30
28
|
# bytes:
|
31
29
|
#
|
32
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
30
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
33
31
|
# :validate_encoding => true)
|
34
32
|
#
|
35
33
|
# Alternately, you can have MARC::Reader replace illegal bytes
|
36
34
|
# with the Unicode Replacement Character, or with a string
|
37
35
|
# of your choice (including the empty string, meaning just omit the bad bytes)
|
38
36
|
#
|
39
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
37
|
+
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
40
38
|
# :invalid => :replace)
|
41
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
39
|
+
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
42
40
|
# :invalid => :replace, :replace => "")
|
43
41
|
#
|
44
42
|
# If you supply an :external_encoding argument, MARC::Reader will
|
45
43
|
# always assume that encoding -- if you leave it off, MARC::Reader
|
46
44
|
# will use the encoding tagged on any input you pass in, such
|
47
|
-
# as Strings or File handles.
|
45
|
+
# as Strings or File handles.
|
48
46
|
#
|
49
47
|
# # marc data will have same encoding as string.encoding:
|
50
48
|
# MARC::Reader.decode( string )
|
51
49
|
#
|
52
50
|
# # Same, values will have encoding of string.encoding:
|
53
|
-
# MARC::Reader.new(StringIO.new(string))
|
51
|
+
# MARC::Reader.new(StringIO.new(string))
|
54
52
|
#
|
55
53
|
# # data values will have cp866 encoding, per external_encoding of
|
56
54
|
# # File object passed in
|
57
55
|
# MARC::Reader.new(File.new("myfile.marc", "r:cp866"))
|
58
56
|
#
|
59
57
|
# # explicitly tell MARC::Reader the encoding
|
60
|
-
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
58
|
+
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
61
59
|
#
|
62
60
|
# === MARC-8
|
63
61
|
#
|
64
62
|
# The legacy MARC-8 encoding needs to be handled differently, because
|
65
|
-
# there is no built-in support in ruby for MARC-8.
|
63
|
+
# there is no built-in support in ruby for MARC-8.
|
66
64
|
#
|
67
65
|
# You _can_ specify "MARC-8" as an external encoding. It will trigger
|
68
|
-
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
66
|
+
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
69
67
|
#
|
70
68
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
|
71
69
|
#
|
72
70
|
# For external_encoding "MARC-8", :validate_encoding is always true,
|
73
71
|
# there's no way to ignore bad bytes in MARC-8 when transcoding to
|
74
|
-
# unicode. However, just as with other encodings, the
|
72
|
+
# unicode. However, just as with other encodings, the
|
75
73
|
# `:invalid => :replace` and `:replace => "string"`
|
76
|
-
# options can be used to replace bad bytes instead of raising.
|
74
|
+
# options can be used to replace bad bytes instead of raising.
|
77
75
|
#
|
78
76
|
# If you want your MARC-8 to be transcoded internally to something
|
79
77
|
# other than UTF-8, you can use the :internal_encoding option
|
80
|
-
# which works with any encoding in MARC::Reader.
|
78
|
+
# which works with any encoding in MARC::Reader.
|
81
79
|
#
|
82
|
-
# MARC::Reader.new("marc8.mrc",
|
83
|
-
# :external_encoding => "MARC-8",
|
80
|
+
# MARC::Reader.new("marc8.mrc",
|
81
|
+
# :external_encoding => "MARC-8",
|
84
82
|
# :internal_encoding => "UTF-16LE")
|
85
83
|
#
|
86
84
|
# If you want to read in MARC-8 without transcoding, leaving the
|
@@ -90,48 +88,48 @@ module MARC
|
|
90
88
|
#
|
91
89
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
|
92
90
|
#
|
93
|
-
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
-
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
91
|
+
# Please note that MARC::Reader does _not_ currently have any facilities
|
92
|
+
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
95
93
|
#
|
96
94
|
# === Complete Encoding Options
|
97
95
|
#
|
98
96
|
# These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
|
99
97
|
# to specify external encoding, ask for a transcode to a different
|
100
|
-
# encoding on read, or validate or replace bad bytes in source.
|
98
|
+
# encoding on read, or validate or replace bad bytes in source.
|
101
99
|
#
|
102
100
|
# [:external_encoding]
|
103
101
|
# What encoding to consider the MARC record's values to be in. This option
|
104
|
-
# takes precedence over the File handle or String argument's encodings.
|
102
|
+
# takes precedence over the File handle or String argument's encodings.
|
105
103
|
# [:internal_encoding]
|
106
104
|
# Ask MARC::Reader to transcode to this encoding in memory after reading
|
107
|
-
# the file in.
|
105
|
+
# the file in.
|
108
106
|
# [:validate_encoding]
|
109
107
|
# If you pass in `true`, MARC::Reader will promise to raise an Encoding::InvalidByteSequenceError
|
110
108
|
# if there are illegal bytes in the source for the :external_encoding. There is
|
111
109
|
# a performance penalty for this check. Without this option, an exception
|
112
|
-
# _may_ or _may not_ be raised, and whether an exception or raised (or
|
110
|
+
# _may_ or _may not_ be raised, and whether an exception or raised (or
|
113
111
|
# what class the exception has) may change in future ruby-marc versions
|
114
|
-
# without warning.
|
112
|
+
# without warning.
|
115
113
|
# [:invalid]
|
116
114
|
# Just like String#encode, set to :replace and any bytes in source data
|
117
|
-
# illegal for the source encoding will be replaced with the unicode
|
115
|
+
# illegal for the source encoding will be replaced with the unicode
|
118
116
|
# replacement character (when in unicode encodings), or else '?'. Overrides
|
119
117
|
# :validate_encoding. This can help you sanitize your input and
|
120
|
-
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
118
|
+
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
121
119
|
# [:replace]
|
122
120
|
# Just like String#encode, combine with `:invalid=>:replace`, set
|
123
121
|
# your own replacement string for invalid bytes. You may use the
|
124
|
-
# empty string to simply eliminate invalid bytes.
|
122
|
+
# empty string to simply eliminate invalid bytes.
|
125
123
|
#
|
126
124
|
# === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
|
127
125
|
#
|
128
|
-
# Be careful with using an explicit File object with the File's own
|
129
|
-
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
-
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
126
|
+
# Be careful with using an explicit File object with the File's own
|
127
|
+
# :internal_encoding set -- it can cause ruby to transcode your data
|
128
|
+
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
131
129
|
# marc record unreadable in some cases. This
|
132
130
|
# applies to Encoding.default_encoding too!
|
133
131
|
#
|
134
|
-
# # May in some cases result in unreadable marc and an exception
|
132
|
+
# # May in some cases result in unreadable marc and an exception
|
135
133
|
# MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:cp866:utf-8") )
|
136
134
|
#
|
137
135
|
# # May in some cases result in unreadable marc and an exception
|
@@ -156,7 +154,7 @@ module MARC
|
|
156
154
|
# https://jira.codehaus.org/browse/JRUBY-6637
|
157
155
|
#
|
158
156
|
# We recommend using the latest version of jruby, especially
|
159
|
-
# at least jruby 1.7.6.
|
157
|
+
# at least jruby 1.7.6.
|
160
158
|
class Reader
|
161
159
|
include Enumerable
|
162
160
|
|
@@ -182,43 +180,42 @@ module MARC
|
|
182
180
|
#
|
183
181
|
# Also, if your data encoded with non ascii/utf-8 encoding
|
184
182
|
# (for ex. when reading RUSMARC data) and you use ruby 1.9
|
185
|
-
# you can specify source data encoding with an option.
|
183
|
+
# you can specify source data encoding with an option.
|
186
184
|
#
|
187
185
|
# reader = MARC::Reader.new('marc.dat', :external_encoding => 'cp866')
|
188
186
|
#
|
189
187
|
# or, you can pass IO, opened in the corresponding encoding
|
190
188
|
#
|
191
189
|
# reader = MARC::Reader.new(File.new('marc.dat', 'r:cp866'))
|
192
|
-
def initialize(file, options = {})
|
190
|
+
def initialize(file, options = {})
|
193
191
|
@encoding_options = {}
|
194
192
|
# all can be nil
|
195
193
|
[:internal_encoding, :external_encoding, :invalid, :replace, :validate_encoding].each do |key|
|
196
194
|
@encoding_options[key] = options[key] if options.has_key?(key)
|
197
195
|
end
|
198
|
-
|
199
|
-
if file.is_a?(String)
|
196
|
+
|
197
|
+
if file.is_a?(String)
|
200
198
|
@handle = File.new(file)
|
201
|
-
elsif file.respond_to?(
|
199
|
+
elsif file.respond_to?(:read, 5)
|
202
200
|
@handle = file
|
203
201
|
else
|
204
202
|
raise ArgumentError, "must pass in path or file"
|
205
203
|
end
|
206
|
-
|
207
|
-
if (
|
204
|
+
|
205
|
+
if (!@encoding_options[:external_encoding]) && @handle.respond_to?(:external_encoding)
|
208
206
|
# use file encoding only if we didn't already have an explicit one,
|
209
|
-
# explicit one takes precedence.
|
207
|
+
# explicit one takes precedence.
|
210
208
|
#
|
211
209
|
# Note, please don't use ruby's own internal_encoding transcode
|
212
210
|
# with binary marc data, the transcode can mess up the byte count
|
213
|
-
# and make it unreadable.
|
211
|
+
# and make it unreadable.
|
214
212
|
@encoding_options[:external_encoding] ||= @handle.external_encoding
|
215
213
|
end
|
216
214
|
|
217
215
|
# Only pull in the MARC8 translation if we need it, since it's really big
|
218
|
-
if @encoding_options[:external_encoding]
|
219
|
-
require
|
216
|
+
if @encoding_options[:external_encoding] == "MARC-8"
|
217
|
+
require "marc/marc8/to_unicode" unless defined? MARC::Marc8::ToUnicode
|
220
218
|
end
|
221
|
-
|
222
219
|
end
|
223
220
|
|
224
221
|
# to support iteration:
|
@@ -226,13 +223,13 @@ module MARC
|
|
226
223
|
# print record
|
227
224
|
# end
|
228
225
|
def each
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
self.each_raw do |raw|
|
233
|
-
record = self.decode(raw)
|
226
|
+
if block_given?
|
227
|
+
each_raw do |raw|
|
228
|
+
record = decode(raw)
|
234
229
|
yield record
|
235
230
|
end
|
231
|
+
else
|
232
|
+
enum_for(:each)
|
236
233
|
end
|
237
234
|
end
|
238
235
|
|
@@ -257,10 +254,8 @@ module MARC
|
|
257
254
|
#
|
258
255
|
# If no block is given, an enumerator is returned
|
259
256
|
def each_raw
|
260
|
-
|
261
|
-
|
262
|
-
else
|
263
|
-
while rec_length_s = @handle.read(5)
|
257
|
+
if block_given?
|
258
|
+
while (rec_length_s = @handle.read(5))
|
264
259
|
# make sure the record length looks like an integer
|
265
260
|
rec_length_i = rec_length_s.to_i
|
266
261
|
if rec_length_i == 0
|
@@ -269,9 +264,11 @@ module MARC
|
|
269
264
|
|
270
265
|
# get the raw MARC21 for a record back from the file
|
271
266
|
# using the record length
|
272
|
-
raw = rec_length_s + @handle.read(rec_length_i-5)
|
267
|
+
raw = rec_length_s + @handle.read(rec_length_i - 5)
|
273
268
|
yield raw
|
274
269
|
end
|
270
|
+
else
|
271
|
+
enum_for(:each_raw)
|
275
272
|
end
|
276
273
|
end
|
277
274
|
|
@@ -280,7 +277,7 @@ module MARC
|
|
280
277
|
# Wraps the class method MARC::Reader::decode, using the encoding options of
|
281
278
|
# the MARC::Reader instance.
|
282
279
|
def decode(marc)
|
283
|
-
|
280
|
+
MARC::Reader.decode(marc, @encoding_options)
|
284
281
|
end
|
285
282
|
|
286
283
|
# A static method for turning raw MARC data in transission
|
@@ -288,34 +285,34 @@ module MARC
|
|
288
285
|
# First argument is a String
|
289
286
|
# options include:
|
290
287
|
# [:external_encoding] encoding of MARC record data values
|
291
|
-
# [:forgiving] needs more docs, true is some kind of forgiving
|
292
|
-
# of certain kinds of bad MARC.
|
293
|
-
def self.decode(marc, params={})
|
288
|
+
# [:forgiving] needs more docs, true is some kind of forgiving
|
289
|
+
# of certain kinds of bad MARC.
|
290
|
+
def self.decode(marc, params = {})
|
294
291
|
if params.has_key?(:encoding)
|
295
|
-
|
292
|
+
warn "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
|
296
293
|
params[:external_encoding] = params.delete(:encoding)
|
297
294
|
end
|
298
|
-
|
299
|
-
if (!
|
295
|
+
|
296
|
+
if (!params.has_key? :external_encoding) && marc.respond_to?(:encoding)
|
300
297
|
# If no forced external_encoding giving, respect the encoding
|
301
|
-
# declared on the string passed in.
|
298
|
+
# declared on the string passed in.
|
302
299
|
params[:external_encoding] = marc.encoding
|
303
300
|
end
|
304
301
|
# And now that we've recorded the current encoding, we force
|
305
302
|
# to binary encoding, because we're going to be doing byte arithmetic,
|
306
|
-
# and want to avoid byte-vs-char confusion.
|
303
|
+
# and want to avoid byte-vs-char confusion.
|
307
304
|
marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
|
308
|
-
|
309
|
-
record = Record.new
|
310
|
-
record.leader = marc[0..LEADER_LENGTH-1]
|
305
|
+
|
306
|
+
record = Record.new
|
307
|
+
record.leader = marc[0..LEADER_LENGTH - 1]
|
311
308
|
|
312
309
|
# where the field data starts
|
313
310
|
base_address = record.leader[12..16].to_i
|
314
311
|
|
315
312
|
# get the byte offsets from the record directory
|
316
|
-
directory = marc[LEADER_LENGTH..base_address-1]
|
313
|
+
directory = marc[LEADER_LENGTH..base_address - 1]
|
317
314
|
|
318
|
-
raise MARC::Exception.new("invalid directory in record") if directory
|
315
|
+
raise MARC::Exception.new("invalid directory in record") if directory.nil?
|
319
316
|
|
320
317
|
# the number of fields in the record corresponds to
|
321
318
|
# how many directory entries there are
|
@@ -324,20 +321,19 @@ module MARC
|
|
324
321
|
# when operating in forgiving mode we just split on end of
|
325
322
|
# field instead of using calculated byte offsets from the
|
326
323
|
# directory
|
327
|
-
if params[:forgiving]
|
324
|
+
if params[:forgiving]
|
328
325
|
marc_field_data = marc[base_address..-1]
|
329
326
|
# It won't let us do the split on bad utf8 data, but
|
330
327
|
# we haven't yet set the 'proper' encoding or used
|
331
328
|
# our correction/replace options. So call it binary for now.
|
332
329
|
marc_field_data.force_encoding("binary") if marc_field_data.respond_to?(:force_encoding)
|
333
|
-
|
330
|
+
|
334
331
|
all_fields = marc_field_data.split(END_OF_FIELD)
|
335
332
|
else
|
336
|
-
mba =
|
333
|
+
mba = marc.bytes.to_a
|
337
334
|
end
|
338
335
|
|
339
|
-
0.upto(num_fields-1) do |field_num|
|
340
|
-
|
336
|
+
0.upto(num_fields - 1) do |field_num|
|
341
337
|
# pull the directory entry for a field out
|
342
338
|
entry_start = field_num * DIRECTORY_ENTRY_LENGTH
|
343
339
|
entry_end = entry_start + DIRECTORY_ENTRY_LENGTH
|
@@ -350,12 +346,12 @@ module MARC
|
|
350
346
|
# if we were told to be forgiving we just use the
|
351
347
|
# next available chuck of field data that we
|
352
348
|
# split apart based on the END_OF_FIELD
|
353
|
-
field_data =
|
349
|
+
field_data = ""
|
354
350
|
if params[:forgiving]
|
355
|
-
field_data = all_fields.shift
|
351
|
+
field_data = all_fields.shift
|
356
352
|
|
357
|
-
|
358
|
-
|
353
|
+
# otherwise we actually use the byte offsets in
|
354
|
+
# directory to figure out what field data to extract
|
359
355
|
else
|
360
356
|
length = entry[3..6].to_i
|
361
357
|
offset = entry[7..11].to_i
|
@@ -366,11 +362,11 @@ module MARC
|
|
366
362
|
|
367
363
|
# remove end of field
|
368
364
|
field_data.delete!(END_OF_FIELD)
|
369
|
-
|
365
|
+
|
370
366
|
# add a control field or data field
|
371
367
|
if MARC::ControlField.control_tag?(tag)
|
372
|
-
field_data = MARC::Reader.set_encoding(
|
373
|
-
record.append(MARC::ControlField.new(tag,field_data))
|
368
|
+
field_data = MARC::Reader.set_encoding(field_data, params)
|
369
|
+
record.append(MARC::ControlField.new(tag, field_data))
|
374
370
|
else
|
375
371
|
field = MARC::DataField.new(tag)
|
376
372
|
|
@@ -379,17 +375,17 @@ module MARC
|
|
379
375
|
|
380
376
|
# must have at least 2 elements (indicators, and 1 subfield)
|
381
377
|
# TODO some sort of logging?
|
382
|
-
next if subfields.length
|
378
|
+
next if subfields.length < 2
|
383
379
|
|
384
380
|
# get indicators
|
385
|
-
indicators = MARC::Reader.set_encoding(
|
386
|
-
field.indicator1 = indicators[0,1]
|
387
|
-
field.indicator2 = indicators[1,1]
|
381
|
+
indicators = MARC::Reader.set_encoding(subfields.shift, params)
|
382
|
+
field.indicator1 = indicators[0, 1]
|
383
|
+
field.indicator2 = indicators[1, 1]
|
388
384
|
|
389
385
|
# add each subfield to the field
|
390
|
-
subfields.each
|
391
|
-
data = MARC::Reader.set_encoding(
|
392
|
-
subfield = MARC::Subfield.new(data[0,1],data[1..-1])
|
386
|
+
subfields.each do |data|
|
387
|
+
data = MARC::Reader.set_encoding(data, params)
|
388
|
+
subfield = MARC::Subfield.new(data[0, 1], data[1..-1])
|
393
389
|
field.append(subfield)
|
394
390
|
end
|
395
391
|
|
@@ -398,10 +394,12 @@ module MARC
|
|
398
394
|
end
|
399
395
|
end
|
400
396
|
|
401
|
-
|
402
|
-
end
|
397
|
+
raise MARC::RecordException, record unless record.valid?
|
403
398
|
|
404
|
-
|
399
|
+
record
|
400
|
+
end
|
401
|
+
|
402
|
+
# input passed in probably has 'binary' encoding.
|
405
403
|
# We'll set it to the proper encoding, and depending on settings, optionally
|
406
404
|
# * check for valid encoding
|
407
405
|
# * raise if not valid
|
@@ -411,16 +409,16 @@ module MARC
|
|
411
409
|
# Special case for encoding "MARC-8" -- will be transcoded to
|
412
410
|
# UTF-8 (then further transcoded to external_encoding, if set).
|
413
411
|
# For "MARC-8", validate_encoding is always true, there's no way to
|
414
|
-
# ignore bad bytes.
|
412
|
+
# ignore bad bytes.
|
415
413
|
#
|
416
414
|
# Params options:
|
417
|
-
#
|
418
|
-
# * external_encoding: what encoding the input is expected to be in
|
415
|
+
#
|
416
|
+
# * external_encoding: what encoding the input is expected to be in
|
419
417
|
# * validate_encoding: if true, will raise if an invalid encoding
|
420
418
|
# * invalid: if set to :replace, will replace bad bytes with replacement
|
421
|
-
# chars instead of raising.
|
419
|
+
# chars instead of raising.
|
422
420
|
# * replace: Set replacement char for use with 'invalid', otherwise defaults
|
423
|
-
# to unicode replacement char, or question mark.
|
421
|
+
# to unicode replacement char, or question mark.
|
424
422
|
def self.set_encoding(str, params)
|
425
423
|
if str.respond_to?(:force_encoding)
|
426
424
|
if params[:external_encoding]
|
@@ -430,41 +428,38 @@ module MARC
|
|
430
428
|
else
|
431
429
|
str = str.force_encoding(params[:external_encoding])
|
432
430
|
end
|
433
|
-
end
|
434
|
-
|
431
|
+
end
|
432
|
+
|
435
433
|
# If we're transcoding anyway, pass our invalid/replace options
|
436
434
|
# on to String#encode, which will take care of them -- or raise
|
437
|
-
# with illegal bytes without :replace=>:invalid.
|
435
|
+
# with illegal bytes without :replace=>:invalid.
|
438
436
|
#
|
439
437
|
# If we're NOT transcoding, we need to use our own pure-ruby
|
440
438
|
# implementation to do invalid byte replacements. OR to raise
|
441
439
|
# a predicatable exception iff :validate_encoding, otherwise
|
442
440
|
# for performance we won't check, and you may or may not
|
443
441
|
# get an exception from inside ruby-marc, and it may change
|
444
|
-
# in future implementations.
|
442
|
+
# in future implementations.
|
445
443
|
if params[:internal_encoding]
|
446
|
-
if RUBY_VERSION >=
|
447
|
-
str
|
444
|
+
str = if RUBY_VERSION >= "3.0"
|
445
|
+
str.encode(params[:internal_encoding], **params)
|
448
446
|
else
|
449
|
-
str
|
447
|
+
str.encode(params[:internal_encoding], params)
|
450
448
|
end
|
451
|
-
elsif
|
449
|
+
elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
|
452
450
|
|
453
|
-
if params[:validate_encoding] == true && !
|
454
|
-
raise
|
451
|
+
if params[:validate_encoding] == true && !str.valid_encoding?
|
452
|
+
raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
|
455
453
|
end
|
456
454
|
if params[:invalid] == :replace
|
457
455
|
str = str.scrub(params[:replace])
|
458
456
|
end
|
459
|
-
|
460
|
-
end
|
461
|
-
end
|
462
|
-
return str
|
463
|
-
end
|
464
|
-
end
|
465
|
-
|
466
|
-
|
467
457
|
|
458
|
+
end
|
459
|
+
end
|
460
|
+
str
|
461
|
+
end
|
462
|
+
end
|
468
463
|
|
469
464
|
# Like Reader ForgivingReader lets you read in a batch of MARC21 records
|
470
465
|
# but it does not use record lengths and field byte offets found in the
|
@@ -479,22 +474,19 @@ module MARC
|
|
479
474
|
#
|
480
475
|
# **NOTE**: ForgivingReader _may_ have unpredictable results when used
|
481
476
|
# with marc records with char encoding other than system default (usually
|
482
|
-
# UTF8), _especially_ if you have Encoding.default_internal set.
|
477
|
+
# UTF8), _especially_ if you have Encoding.default_internal set.
|
483
478
|
#
|
484
479
|
# Implemented a sub-class of Reader over-riding #each, so we still
|
485
480
|
# get DRY Reader's #initialize with proper char encoding options
|
486
|
-
# and handling.
|
481
|
+
# and handling.
|
487
482
|
class ForgivingReader < Reader
|
488
|
-
|
489
483
|
def each
|
490
484
|
@handle.each_line(END_OF_RECORD) do |raw|
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
# TODO add logging
|
497
|
-
end
|
485
|
+
record = MARC::Reader.decode(raw, @encoding_options.merge(forgiving: true))
|
486
|
+
yield record
|
487
|
+
rescue
|
488
|
+
# caught exception just keep barrelling along
|
489
|
+
# TODO add logging
|
498
490
|
end
|
499
491
|
end
|
500
492
|
end
|