marc 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +102 -30
- data/Gemfile +15 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +117 -123
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +80 -9
data/lib/marc/reader.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
require
|
1
|
+
require "scrub_rb"
|
2
2
|
|
3
3
|
# Note: requiring 'marc/marc8/to_unicode' below, in #initialize,
|
4
4
|
# only when necessary
|
5
5
|
|
6
6
|
module MARC
|
7
|
-
# A class for reading MARC binary (ISO 2709) files.
|
7
|
+
# A class for reading MARC binary (ISO 2709) files.
|
8
8
|
#
|
9
9
|
# == Character Encoding
|
10
10
|
#
|
@@ -12,7 +12,7 @@ module MARC
|
|
12
12
|
# If illegal bytes for that character encoding are encountered in certain
|
13
13
|
# operations, ruby will raise an exception. If a String is incorrectly
|
14
14
|
# tagged with the wrong character encoding, that makes it fairly likely
|
15
|
-
# an illegal byte for the specified encoding will be encountered.
|
15
|
+
# an illegal byte for the specified encoding will be encountered.
|
16
16
|
#
|
17
17
|
# So when reading binary MARC data with the MARC::Reader, it's important
|
18
18
|
# that you let it know the expected encoding:
|
@@ -21,7 +21,7 @@ module MARC
|
|
21
21
|
#
|
22
22
|
# If you leave off 'external_encoding', it will use the ruby environment
|
23
23
|
# Encoding.default_external, which is usually UTF-8 but may depend on your
|
24
|
-
# environment.
|
24
|
+
# environment.
|
25
25
|
#
|
26
26
|
# Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
|
27
27
|
# bytes. By default MARC::Reader will leave these in the produced Strings,
|
@@ -29,58 +29,58 @@ module MARC
|
|
29
29
|
# to catch this early, and ask MARC::Reader to raise immediately on illegal
|
30
30
|
# bytes:
|
31
31
|
#
|
32
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
32
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
33
33
|
# :validate_encoding => true)
|
34
34
|
#
|
35
35
|
# Alternately, you can have MARC::Reader replace illegal bytes
|
36
36
|
# with the Unicode Replacement Character, or with a string
|
37
37
|
# of your choice (including the empty string, meaning just omit the bad bytes)
|
38
38
|
#
|
39
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
39
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
40
40
|
# :invalid => :replace)
|
41
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
41
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
42
42
|
# :invalid => :replace, :replace => "")
|
43
43
|
#
|
44
44
|
# If you supply an :external_encoding argument, MARC::Reader will
|
45
45
|
# always assume that encoding -- if you leave it off, MARC::Reader
|
46
46
|
# will use the encoding tagged on any input you pass in, such
|
47
|
-
# as Strings or File handles.
|
47
|
+
# as Strings or File handles.
|
48
48
|
#
|
49
49
|
# # marc data will have same encoding as string.encoding:
|
50
50
|
# MARC::Reader.decode( string )
|
51
51
|
#
|
52
52
|
# # Same, values will have encoding of string.encoding:
|
53
|
-
# MARC::Reader.new(StringIO.new(string))
|
53
|
+
# MARC::Reader.new(StringIO.new(string))
|
54
54
|
#
|
55
55
|
# # data values will have cp866 encoding, per external_encoding of
|
56
56
|
# # File object passed in
|
57
57
|
# MARC::Reader.new(File.new("myfile.marc", "r:cp866"))
|
58
58
|
#
|
59
59
|
# # explicitly tell MARC::Reader the encoding
|
60
|
-
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
60
|
+
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
61
61
|
#
|
62
62
|
# === MARC-8
|
63
63
|
#
|
64
64
|
# The legacy MARC-8 encoding needs to be handled differently, because
|
65
|
-
# there is no built-in support in ruby for MARC-8.
|
65
|
+
# there is no built-in support in ruby for MARC-8.
|
66
66
|
#
|
67
67
|
# You _can_ specify "MARC-8" as an external encoding. It will trigger
|
68
|
-
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
68
|
+
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
69
69
|
#
|
70
70
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
|
71
71
|
#
|
72
72
|
# For external_encoding "MARC-8", :validate_encoding is always true,
|
73
73
|
# there's no way to ignore bad bytes in MARC-8 when transcoding to
|
74
|
-
# unicode. However, just as with other encodings, the
|
74
|
+
# unicode. However, just as with other encodings, the
|
75
75
|
# `:invalid => :replace` and `:replace => "string"`
|
76
|
-
# options can be used to replace bad bytes instead of raising.
|
76
|
+
# options can be used to replace bad bytes instead of raising.
|
77
77
|
#
|
78
78
|
# If you want your MARC-8 to be transcoded internally to something
|
79
79
|
# other than UTF-8, you can use the :internal_encoding option
|
80
|
-
# which works with any encoding in MARC::Reader.
|
80
|
+
# which works with any encoding in MARC::Reader.
|
81
81
|
#
|
82
|
-
# MARC::Reader.new("marc8.mrc",
|
83
|
-
# :external_encoding => "MARC-8",
|
82
|
+
# MARC::Reader.new("marc8.mrc",
|
83
|
+
# :external_encoding => "MARC-8",
|
84
84
|
# :internal_encoding => "UTF-16LE")
|
85
85
|
#
|
86
86
|
# If you want to read in MARC-8 without transcoding, leaving the
|
@@ -90,48 +90,48 @@ module MARC
|
|
90
90
|
#
|
91
91
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
|
92
92
|
#
|
93
|
-
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
-
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
93
|
+
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
+
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
95
95
|
#
|
96
96
|
# === Complete Encoding Options
|
97
97
|
#
|
98
98
|
# These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
|
99
99
|
# to specify external encoding, ask for a transcode to a different
|
100
|
-
# encoding on read, or validate or replace bad bytes in source.
|
100
|
+
# encoding on read, or validate or replace bad bytes in source.
|
101
101
|
#
|
102
102
|
# [:external_encoding]
|
103
103
|
# What encoding to consider the MARC record's values to be in. This option
|
104
|
-
# takes precedence over the File handle or String argument's encodings.
|
104
|
+
# takes precedence over the File handle or String argument's encodings.
|
105
105
|
# [:internal_encoding]
|
106
106
|
# Ask MARC::Reader to transcode to this encoding in memory after reading
|
107
|
-
# the file in.
|
107
|
+
# the file in.
|
108
108
|
# [:validate_encoding]
|
109
109
|
# If you pass in `true`, MARC::Reader will promise to raise an Encoding::InvalidByteSequenceError
|
110
110
|
# if there are illegal bytes in the source for the :external_encoding. There is
|
111
111
|
# a performance penalty for this check. Without this option, an exception
|
112
|
-
# _may_ or _may not_ be raised, and whether an exception or raised (or
|
112
|
+
# _may_ or _may not_ be raised, and whether an exception is raised (or
|
113
113
|
# what class the exception has) may change in future ruby-marc versions
|
114
|
-
# without warning.
|
114
|
+
# without warning.
|
115
115
|
# [:invalid]
|
116
116
|
# Just like String#encode, set to :replace and any bytes in source data
|
117
|
-
# illegal for the source encoding will be replaced with the unicode
|
117
|
+
# illegal for the source encoding will be replaced with the unicode
|
118
118
|
# replacement character (when in unicode encodings), or else '?'. Overrides
|
119
119
|
# :validate_encoding. This can help you sanitize your input and
|
120
|
-
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
120
|
+
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
121
121
|
# [:replace]
|
122
122
|
# Just like String#encode, combine with `:invalid=>:replace`, set
|
123
123
|
# your own replacement string for invalid bytes. You may use the
|
124
|
-
# empty string to simply eliminate invalid bytes.
|
124
|
+
# empty string to simply eliminate invalid bytes.
|
125
125
|
#
|
126
126
|
# === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
|
127
127
|
#
|
128
|
-
# Be careful with using an explicit File object with the File's own
|
129
|
-
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
-
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
128
|
+
# Be careful with using an explicit File object with the File's own
|
129
|
+
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
+
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
131
131
|
# marc record unreadable in some cases. This
|
132
132
|
# applies to Encoding.default_internal too!
|
133
133
|
#
|
134
|
-
# # May in some cases result in unreadable marc and an exception
|
134
|
+
# # May in some cases result in unreadable marc and an exception
|
135
135
|
# MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:cp866:utf-8") )
|
136
136
|
#
|
137
137
|
# # May in some cases result in unreadable marc and an exception
|
@@ -156,7 +156,7 @@ module MARC
|
|
156
156
|
# https://jira.codehaus.org/browse/JRUBY-6637
|
157
157
|
#
|
158
158
|
# We recommend using the latest version of jruby, especially
|
159
|
-
# at least jruby 1.7.6.
|
159
|
+
# at least jruby 1.7.6.
|
160
160
|
class Reader
|
161
161
|
include Enumerable
|
162
162
|
|
@@ -182,43 +182,42 @@ module MARC
|
|
182
182
|
#
|
183
183
|
# Also, if your data is encoded with a non ascii/utf-8 encoding
|
184
184
|
# (for ex. when reading RUSMARC data) and you use ruby 1.9
|
185
|
-
# you can specify source data encoding with an option.
|
185
|
+
# you can specify source data encoding with an option.
|
186
186
|
#
|
187
187
|
# reader = MARC::Reader.new('marc.dat', :external_encoding => 'cp866')
|
188
188
|
#
|
189
189
|
# or, you can pass IO, opened in the corresponding encoding
|
190
190
|
#
|
191
191
|
# reader = MARC::Reader.new(File.new('marc.dat', 'r:cp866'))
|
192
|
-
def initialize(file, options = {})
|
192
|
+
def initialize(file, options = {})
|
193
193
|
@encoding_options = {}
|
194
194
|
# all can be nil
|
195
195
|
[:internal_encoding, :external_encoding, :invalid, :replace, :validate_encoding].each do |key|
|
196
196
|
@encoding_options[key] = options[key] if options.has_key?(key)
|
197
197
|
end
|
198
|
-
|
199
|
-
if file.is_a?(String)
|
198
|
+
|
199
|
+
if file.is_a?(String)
|
200
200
|
@handle = File.new(file)
|
201
|
-
elsif file.respond_to?(
|
201
|
+
elsif file.respond_to?(:read, 5)
|
202
202
|
@handle = file
|
203
203
|
else
|
204
204
|
raise ArgumentError, "must pass in path or file"
|
205
205
|
end
|
206
|
-
|
207
|
-
if (
|
206
|
+
|
207
|
+
if (!@encoding_options[:external_encoding]) && @handle.respond_to?(:external_encoding)
|
208
208
|
# use file encoding only if we didn't already have an explicit one,
|
209
|
-
# explicit one takes precedence.
|
209
|
+
# explicit one takes precedence.
|
210
210
|
#
|
211
211
|
# Note, please don't use ruby's own internal_encoding transcode
|
212
212
|
# with binary marc data, the transcode can mess up the byte count
|
213
|
-
# and make it unreadable.
|
213
|
+
# and make it unreadable.
|
214
214
|
@encoding_options[:external_encoding] ||= @handle.external_encoding
|
215
215
|
end
|
216
216
|
|
217
217
|
# Only pull in the MARC8 translation if we need it, since it's really big
|
218
|
-
if @encoding_options[:external_encoding]
|
219
|
-
require
|
218
|
+
if @encoding_options[:external_encoding] == "MARC-8"
|
219
|
+
require "marc/marc8/to_unicode" unless defined? MARC::Marc8::ToUnicode
|
220
220
|
end
|
221
|
-
|
222
221
|
end
|
223
222
|
|
224
223
|
# to support iteration:
|
@@ -226,13 +225,13 @@ module MARC
|
|
226
225
|
# print record
|
227
226
|
# end
|
228
227
|
def each
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
self.each_raw do |raw|
|
233
|
-
record = self.decode(raw)
|
228
|
+
if block_given?
|
229
|
+
each_raw do |raw|
|
230
|
+
record = decode(raw)
|
234
231
|
yield record
|
235
232
|
end
|
233
|
+
else
|
234
|
+
enum_for(:each)
|
236
235
|
end
|
237
236
|
end
|
238
237
|
|
@@ -257,10 +256,8 @@ module MARC
|
|
257
256
|
#
|
258
257
|
# If no block is given, an enumerator is returned
|
259
258
|
def each_raw
|
260
|
-
|
261
|
-
|
262
|
-
else
|
263
|
-
while rec_length_s = @handle.read(5)
|
259
|
+
if block_given?
|
260
|
+
while (rec_length_s = @handle.read(5))
|
264
261
|
# make sure the record length looks like an integer
|
265
262
|
rec_length_i = rec_length_s.to_i
|
266
263
|
if rec_length_i == 0
|
@@ -269,9 +266,11 @@ module MARC
|
|
269
266
|
|
270
267
|
# get the raw MARC21 for a record back from the file
|
271
268
|
# using the record length
|
272
|
-
raw = rec_length_s + @handle.read(rec_length_i-5)
|
269
|
+
raw = rec_length_s + @handle.read(rec_length_i - 5)
|
273
270
|
yield raw
|
274
271
|
end
|
272
|
+
else
|
273
|
+
enum_for(:each_raw)
|
275
274
|
end
|
276
275
|
end
|
277
276
|
|
@@ -280,7 +279,7 @@ module MARC
|
|
280
279
|
# Wraps the class method MARC::Reader::decode, using the encoding options of
|
281
280
|
# the MARC::Reader instance.
|
282
281
|
def decode(marc)
|
283
|
-
|
282
|
+
MARC::Reader.decode(marc, @encoding_options)
|
284
283
|
end
|
285
284
|
|
286
285
|
# A static method for turning raw MARC data in transmission
|
@@ -288,34 +287,34 @@ module MARC
|
|
288
287
|
# First argument is a String
|
289
288
|
# options include:
|
290
289
|
# [:external_encoding] encoding of MARC record data values
|
291
|
-
# [:forgiving] needs more docs, true is some kind of forgiving
|
292
|
-
# of certain kinds of bad MARC.
|
293
|
-
def self.decode(marc, params={})
|
290
|
+
# [:forgiving] needs more docs, true is some kind of forgiving
|
291
|
+
# of certain kinds of bad MARC.
|
292
|
+
def self.decode(marc, params = {})
|
294
293
|
if params.has_key?(:encoding)
|
295
|
-
|
294
|
+
warn "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
|
296
295
|
params[:external_encoding] = params.delete(:encoding)
|
297
296
|
end
|
298
|
-
|
299
|
-
if (!
|
297
|
+
|
298
|
+
if (!params.has_key? :external_encoding) && marc.respond_to?(:encoding)
|
300
299
|
# If no forced external_encoding is given, respect the encoding
|
301
|
-
# declared on the string passed in.
|
300
|
+
# declared on the string passed in.
|
302
301
|
params[:external_encoding] = marc.encoding
|
303
302
|
end
|
304
303
|
# And now that we've recorded the current encoding, we force
|
305
304
|
# to binary encoding, because we're going to be doing byte arithmetic,
|
306
|
-
# and want to avoid byte-vs-char confusion.
|
305
|
+
# and want to avoid byte-vs-char confusion.
|
307
306
|
marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
|
308
|
-
|
309
|
-
record = Record.new
|
310
|
-
record.leader = marc[0..LEADER_LENGTH-1]
|
307
|
+
|
308
|
+
record = Record.new
|
309
|
+
record.leader = marc[0..LEADER_LENGTH - 1]
|
311
310
|
|
312
311
|
# where the field data starts
|
313
312
|
base_address = record.leader[12..16].to_i
|
314
313
|
|
315
314
|
# get the byte offsets from the record directory
|
316
|
-
directory = marc[LEADER_LENGTH..base_address-1]
|
315
|
+
directory = marc[LEADER_LENGTH..base_address - 1]
|
317
316
|
|
318
|
-
raise MARC::Exception.new("invalid directory in record") if directory
|
317
|
+
raise MARC::Exception.new("invalid directory in record") if directory.nil?
|
319
318
|
|
320
319
|
# the number of fields in the record corresponds to
|
321
320
|
# how many directory entries there are
|
@@ -324,20 +323,19 @@ module MARC
|
|
324
323
|
# when operating in forgiving mode we just split on end of
|
325
324
|
# field instead of using calculated byte offsets from the
|
326
325
|
# directory
|
327
|
-
if params[:forgiving]
|
326
|
+
if params[:forgiving]
|
328
327
|
marc_field_data = marc[base_address..-1]
|
329
328
|
# It won't let us do the split on bad utf8 data, but
|
330
329
|
# we haven't yet set the 'proper' encoding or used
|
331
330
|
# our correction/replace options. So call it binary for now.
|
332
331
|
marc_field_data.force_encoding("binary") if marc_field_data.respond_to?(:force_encoding)
|
333
|
-
|
332
|
+
|
334
333
|
all_fields = marc_field_data.split(END_OF_FIELD)
|
335
334
|
else
|
336
|
-
mba =
|
335
|
+
mba = marc.bytes.to_a
|
337
336
|
end
|
338
337
|
|
339
|
-
0.upto(num_fields-1) do |field_num|
|
340
|
-
|
338
|
+
0.upto(num_fields - 1) do |field_num|
|
341
339
|
# pull the directory entry for a field out
|
342
340
|
entry_start = field_num * DIRECTORY_ENTRY_LENGTH
|
343
341
|
entry_end = entry_start + DIRECTORY_ENTRY_LENGTH
|
@@ -350,12 +348,12 @@ module MARC
|
|
350
348
|
# if we were told to be forgiving we just use the
|
351
349
|
# next available chuck of field data that we
|
352
350
|
# split apart based on the END_OF_FIELD
|
353
|
-
field_data =
|
351
|
+
field_data = ""
|
354
352
|
if params[:forgiving]
|
355
|
-
field_data = all_fields.shift
|
353
|
+
field_data = all_fields.shift
|
356
354
|
|
357
|
-
|
358
|
-
|
355
|
+
# otherwise we actually use the byte offsets in
|
356
|
+
# directory to figure out what field data to extract
|
359
357
|
else
|
360
358
|
length = entry[3..6].to_i
|
361
359
|
offset = entry[7..11].to_i
|
@@ -366,11 +364,11 @@ module MARC
|
|
366
364
|
|
367
365
|
# remove end of field
|
368
366
|
field_data.delete!(END_OF_FIELD)
|
369
|
-
|
367
|
+
|
370
368
|
# add a control field or data field
|
371
369
|
if MARC::ControlField.control_tag?(tag)
|
372
|
-
field_data = MARC::Reader.set_encoding(
|
373
|
-
record.append(MARC::ControlField.new(tag,field_data))
|
370
|
+
field_data = MARC::Reader.set_encoding(field_data, params)
|
371
|
+
record.append(MARC::ControlField.new(tag, field_data))
|
374
372
|
else
|
375
373
|
field = MARC::DataField.new(tag)
|
376
374
|
|
@@ -379,17 +377,17 @@ module MARC
|
|
379
377
|
|
380
378
|
# must have at least 2 elements (indicators, and 1 subfield)
|
381
379
|
# TODO some sort of logging?
|
382
|
-
next if subfields.length
|
380
|
+
next if subfields.length < 2
|
383
381
|
|
384
382
|
# get indicators
|
385
|
-
indicators = MARC::Reader.set_encoding(
|
386
|
-
field.indicator1 = indicators[0,1]
|
387
|
-
field.indicator2 = indicators[1,1]
|
383
|
+
indicators = MARC::Reader.set_encoding(subfields.shift, params)
|
384
|
+
field.indicator1 = indicators[0, 1]
|
385
|
+
field.indicator2 = indicators[1, 1]
|
388
386
|
|
389
387
|
# add each subfield to the field
|
390
|
-
subfields.each
|
391
|
-
data = MARC::Reader.set_encoding(
|
392
|
-
subfield = MARC::Subfield.new(data[0,1],data[1..-1])
|
388
|
+
subfields.each do |data|
|
389
|
+
data = MARC::Reader.set_encoding(data, params)
|
390
|
+
subfield = MARC::Subfield.new(data[0, 1], data[1..-1])
|
393
391
|
field.append(subfield)
|
394
392
|
end
|
395
393
|
|
@@ -398,10 +396,12 @@ module MARC
|
|
398
396
|
end
|
399
397
|
end
|
400
398
|
|
401
|
-
|
402
|
-
|
399
|
+
raise MARC::RecordException, record unless record.valid?
|
400
|
+
|
401
|
+
record
|
402
|
+
end
|
403
403
|
|
404
|
-
# input passed in probably has 'binary' encoding.
|
404
|
+
# input passed in probably has 'binary' encoding.
|
405
405
|
# We'll set it to the proper encoding, and depending on settings, optionally
|
406
406
|
# * check for valid encoding
|
407
407
|
# * raise if not valid
|
@@ -411,16 +411,16 @@ module MARC
|
|
411
411
|
# Special case for encoding "MARC-8" -- will be transcoded to
|
412
412
|
# UTF-8 (then further transcoded to internal_encoding, if set).
|
413
413
|
# For "MARC-8", validate_encoding is always true, there's no way to
|
414
|
-
# ignore bad bytes.
|
414
|
+
# ignore bad bytes.
|
415
415
|
#
|
416
416
|
# Params options:
|
417
|
-
#
|
418
|
-
# * external_encoding: what encoding the input is expected to be in
|
417
|
+
#
|
418
|
+
# * external_encoding: what encoding the input is expected to be in
|
419
419
|
# * validate_encoding: if true, will raise if invalid bytes are found for the encoding
|
420
420
|
# * invalid: if set to :replace, will replace bad bytes with replacement
|
421
|
-
# chars instead of raising.
|
421
|
+
# chars instead of raising.
|
422
422
|
# * replace: Set replacement char for use with 'invalid', otherwise defaults
|
423
|
-
# to unicode replacement char, or question mark.
|
423
|
+
# to unicode replacement char, or question mark.
|
424
424
|
def self.set_encoding(str, params)
|
425
425
|
if str.respond_to?(:force_encoding)
|
426
426
|
if params[:external_encoding]
|
@@ -430,41 +430,38 @@ module MARC
|
|
430
430
|
else
|
431
431
|
str = str.force_encoding(params[:external_encoding])
|
432
432
|
end
|
433
|
-
end
|
434
|
-
|
433
|
+
end
|
434
|
+
|
435
435
|
# If we're transcoding anyway, pass our invalid/replace options
|
436
436
|
# on to String#encode, which will take care of them -- or raise
|
437
|
-
# with illegal bytes without :replace=>:invalid.
|
437
|
+
# with illegal bytes without :invalid=>:replace.
|
438
438
|
#
|
439
439
|
# If we're NOT transcoding, we need to use our own pure-ruby
|
440
440
|
# implementation to do invalid byte replacements. OR to raise
|
441
441
|
# a predictable exception iff :validate_encoding, otherwise
|
442
442
|
# for performance we won't check, and you may or may not
|
443
443
|
# get an exception from inside ruby-marc, and it may change
|
444
|
-
# in future implementations.
|
444
|
+
# in future implementations.
|
445
445
|
if params[:internal_encoding]
|
446
|
-
if RUBY_VERSION >=
|
447
|
-
str
|
446
|
+
str = if RUBY_VERSION >= "3.0"
|
447
|
+
str.encode(params[:internal_encoding], **params)
|
448
448
|
else
|
449
|
-
str
|
449
|
+
str.encode(params[:internal_encoding], params)
|
450
450
|
end
|
451
|
-
elsif
|
451
|
+
elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
|
452
452
|
|
453
|
-
if params[:validate_encoding] == true && !
|
454
|
-
raise
|
453
|
+
if params[:validate_encoding] == true && !str.valid_encoding?
|
454
|
+
raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
|
455
455
|
end
|
456
456
|
if params[:invalid] == :replace
|
457
457
|
str = str.scrub(params[:replace])
|
458
458
|
end
|
459
|
-
|
460
|
-
end
|
461
|
-
end
|
462
|
-
return str
|
463
|
-
end
|
464
|
-
end
|
465
|
-
|
466
|
-
|
467
459
|
|
460
|
+
end
|
461
|
+
end
|
462
|
+
str
|
463
|
+
end
|
464
|
+
end
|
468
465
|
|
469
466
|
# Like Reader ForgivingReader lets you read in a batch of MARC21 records
|
470
467
|
# but it does not use record lengths and field byte offsets found in the
|
@@ -479,22 +476,19 @@ module MARC
|
|
479
476
|
#
|
480
477
|
# **NOTE**: ForgivingReader _may_ have unpredictable results when used
|
481
478
|
# with marc records with char encoding other than system default (usually
|
482
|
-
# UTF8), _especially_ if you have Encoding.default_internal set.
|
479
|
+
# UTF8), _especially_ if you have Encoding.default_internal set.
|
483
480
|
#
|
484
481
|
# Implemented a sub-class of Reader over-riding #each, so we still
|
485
482
|
# get DRY Reader's #initialize with proper char encoding options
|
486
|
-
# and handling.
|
483
|
+
# and handling.
|
487
484
|
class ForgivingReader < Reader
|
488
|
-
|
489
485
|
def each
|
490
486
|
@handle.each_line(END_OF_RECORD) do |raw|
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
# TODO add logging
|
497
|
-
end
|
487
|
+
record = MARC::Reader.decode(raw, @encoding_options.merge(forgiving: true))
|
488
|
+
yield record
|
489
|
+
rescue
|
490
|
+
# caught exception just keep barrelling along
|
491
|
+
# TODO add logging
|
498
492
|
end
|
499
493
|
end
|
500
494
|
end
|