marc 1.1.1 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +116 -30
- data/Gemfile +5 -0
- data/README.md +239 -46
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -87
- data/lib/marc/reader.rb +116 -124
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -82
- data/lib/marc.rb +20 -18
- data/marc.gemspec +28 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -34
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +101 -94
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +129 -22
data/lib/marc/reader.rb
CHANGED
@@ -1,10 +1,8 @@
|
|
1
|
-
require 'scrub_rb'
|
2
|
-
|
3
1
|
# Note: requiring 'marc/marc8/to_unicode' below, in #initialize,
|
4
2
|
# only when necessary
|
5
3
|
|
6
4
|
module MARC
|
7
|
-
# A class for reading MARC binary (ISO 2709) files.
|
5
|
+
# A class for reading MARC binary (ISO 2709) files.
|
8
6
|
#
|
9
7
|
# == Character Encoding
|
10
8
|
#
|
@@ -12,7 +10,7 @@ module MARC
|
|
12
10
|
# If illegal bytes for that character encoding are encountered in certain
|
13
11
|
# operations, ruby will raise an exception. If a String is incorrectly
|
14
12
|
# tagged with the wrong character encoding, that makes it fairly likely
|
15
|
-
# an illegal byte for the specified encoding will be encountered.
|
13
|
+
# an illegal byte for the specified encoding will be encountered.
|
16
14
|
#
|
17
15
|
# So when reading binary MARC data with the MARC::Reader, it's important
|
18
16
|
# that you let it know the expected encoding:
|
@@ -21,7 +19,7 @@ module MARC
|
|
21
19
|
#
|
22
20
|
# If you leave off 'external_encoding', it will use the ruby environment
|
23
21
|
# Encoding.default_external, which is usually UTF-8 but may depend on your
|
24
|
-
# environment.
|
22
|
+
# environment.
|
25
23
|
#
|
26
24
|
# Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
|
27
25
|
# bytes. By default MARC::Reader will leave these in the produced Strings,
|
@@ -29,58 +27,58 @@ module MARC
|
|
29
27
|
# to catch this early, and ask MARC::Reader to raise immediately on illegal
|
30
28
|
# bytes:
|
31
29
|
#
|
32
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
30
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
33
31
|
# :validate_encoding => true)
|
34
32
|
#
|
35
33
|
# Alternately, you can have MARC::Reader replace illegal bytes
|
36
34
|
# with the Unicode Replacement Character, or with a string
|
37
35
|
# of your choice (including the empty string, meaning just omit the bad bytes)
|
38
36
|
#
|
39
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
37
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
40
38
|
# :invalid => :replace)
|
41
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
39
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
42
40
|
# :invalid => :replace, :replace => "")
|
43
41
|
#
|
44
42
|
# If you supply an :external_encoding argument, MARC::Reader will
|
45
43
|
# always assume that encoding -- if you leave it off, MARC::Reader
|
46
44
|
# will use the encoding tagged on any input you pass in, such
|
47
|
-
# as Strings or File handles.
|
45
|
+
# as Strings or File handles.
|
48
46
|
#
|
49
47
|
# # marc data will have same encoding as string.encoding:
|
50
48
|
# MARC::Reader.decode( string )
|
51
49
|
#
|
52
50
|
# # Same, values will have encoding of string.encoding:
|
53
|
-
# MARC::Reader.new(StringIO.new(string))
|
51
|
+
# MARC::Reader.new(StringIO.new(string))
|
54
52
|
#
|
55
53
|
# # data values will have cp866 encoding, per external_encoding of
|
56
54
|
# # File object passed in
|
57
55
|
# MARC::Reader.new(File.new("myfile.marc", "r:cp866"))
|
58
56
|
#
|
59
57
|
# # explicitly tell MARC::Reader the encoding
|
60
|
-
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
58
|
+
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
61
59
|
#
|
62
60
|
# === MARC-8
|
63
61
|
#
|
64
62
|
# The legacy MARC-8 encoding needs to be handled differently, because
|
65
|
-
# there is no built-in support in ruby for MARC-8.
|
63
|
+
# there is no built-in support in ruby for MARC-8.
|
66
64
|
#
|
67
65
|
# You _can_ specify "MARC-8" as an external encoding. It will trigger
|
68
|
-
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
66
|
+
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
69
67
|
#
|
70
68
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
|
71
69
|
#
|
72
70
|
# For external_encoding "MARC-8", :validate_encoding is always true,
|
73
71
|
# there's no way to ignore bad bytes in MARC-8 when transcoding to
|
74
|
-
# unicode. However, just as with other encodings, the
|
72
|
+
# unicode. However, just as with other encodings, the
|
75
73
|
# `:invalid => :replace` and `:replace => "string"`
|
76
|
-
# options can be used to replace bad bytes instead of raising.
|
74
|
+
# options can be used to replace bad bytes instead of raising.
|
77
75
|
#
|
78
76
|
# If you want your MARC-8 to be transcoded internally to something
|
79
77
|
# other than UTF-8, you can use the :internal_encoding option
|
80
|
-
# which works with any encoding in MARC::Reader.
|
78
|
+
# which works with any encoding in MARC::Reader.
|
81
79
|
#
|
82
|
-
# MARC::Reader.new("marc8.mrc",
|
83
|
-
# :external_encoding => "MARC-8",
|
80
|
+
# MARC::Reader.new("marc8.mrc",
|
81
|
+
# :external_encoding => "MARC-8",
|
84
82
|
# :internal_encoding => "UTF-16LE")
|
85
83
|
#
|
86
84
|
# If you want to read in MARC-8 without transcoding, leaving the
|
@@ -90,48 +88,48 @@ module MARC
|
|
90
88
|
#
|
91
89
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
|
92
90
|
#
|
93
|
-
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
-
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
91
|
+
# Please note that MARC::Reader does _not_ currently have any facilities
|
92
|
+
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
95
93
|
#
|
96
94
|
# === Complete Encoding Options
|
97
95
|
#
|
98
96
|
# These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
|
99
97
|
# to specify external encoding, ask for a transcode to a different
|
100
|
-
# encoding on read, or validate or replace bad bytes in source.
|
98
|
+
# encoding on read, or validate or replace bad bytes in source.
|
101
99
|
#
|
102
100
|
# [:external_encoding]
|
103
101
|
# What encoding to consider the MARC record's values to be in. This option
|
104
|
-
# takes precedence over the File handle or String argument's encodings.
|
102
|
+
# takes precedence over the File handle or String argument's encodings.
|
105
103
|
# [:internal_encoding]
|
106
104
|
# Ask MARC::Reader to transcode to this encoding in memory after reading
|
107
|
-
# the file in.
|
105
|
+
# the file in.
|
108
106
|
# [:validate_encoding]
|
109
107
|
# If you pass in `true`, MARC::Reader will promise to raise an Encoding::InvalidByteSequenceError
|
110
108
|
# if there are illegal bytes in the source for the :external_encoding. There is
|
111
109
|
# a performance penalty for this check. Without this option, an exception
|
112
|
-
# _may_ or _may not_ be raised, and whether an exception is raised (or
|
110
|
+
# _may_ or _may not_ be raised, and whether an exception is raised (or
|
113
111
|
# what class the exception has) may change in future ruby-marc versions
|
114
|
-
# without warning.
|
112
|
+
# without warning.
|
115
113
|
# [:invalid]
|
116
114
|
# Just like String#encode, set to :replace and any bytes in source data
|
117
|
-
# illegal for the source encoding will be replaced with the unicode
|
115
|
+
# illegal for the source encoding will be replaced with the unicode
|
118
116
|
# replacement character (when in unicode encodings), or else '?'. Overrides
|
119
117
|
# :validate_encoding. This can help you sanitize your input and
|
120
|
-
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
118
|
+
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
121
119
|
# [:replace]
|
122
120
|
# Just like String#encode, combine with `:invalid=>:replace`, set
|
123
121
|
# your own replacement string for invalid bytes. You may use the
|
124
|
-
# empty string to simply eliminate invalid bytes.
|
122
|
+
# empty string to simply eliminate invalid bytes.
|
125
123
|
#
|
126
124
|
# === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
|
127
125
|
#
|
128
|
-
# Be careful with using an explicit File object with the File's own
|
129
|
-
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
-
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
126
|
+
# Be careful with using an explicit File object with the File's own
|
127
|
+
# :internal_encoding set -- it can cause ruby to transcode your data
|
128
|
+
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
131
129
|
# marc record unreadable in some cases. This
|
132
130
|
# applies to Encoding.default_internal too!
|
133
131
|
#
|
134
|
-
# # May in some cases result in unreadable marc and an exception
|
132
|
+
# # May in some cases result in unreadable marc and an exception
|
135
133
|
# MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:cp866:utf-8") )
|
136
134
|
#
|
137
135
|
# # May in some cases result in unreadable marc and an exception
|
@@ -156,7 +154,7 @@ module MARC
|
|
156
154
|
# https://jira.codehaus.org/browse/JRUBY-6637
|
157
155
|
#
|
158
156
|
# We recommend using the latest version of jruby, especially
|
159
|
-
# at least jruby 1.7.6.
|
157
|
+
# at least jruby 1.7.6.
|
160
158
|
class Reader
|
161
159
|
include Enumerable
|
162
160
|
|
@@ -182,43 +180,42 @@ module MARC
|
|
182
180
|
#
|
183
181
|
# Also, if your data is encoded with non ascii/utf-8 encoding
|
184
182
|
# (for ex. when reading RUSMARC data) and you use ruby 1.9
|
185
|
-
# you can specify source data encoding with an option.
|
183
|
+
# you can specify source data encoding with an option.
|
186
184
|
#
|
187
185
|
# reader = MARC::Reader.new('marc.dat', :external_encoding => 'cp866')
|
188
186
|
#
|
189
187
|
# or, you can pass IO, opened in the corresponding encoding
|
190
188
|
#
|
191
189
|
# reader = MARC::Reader.new(File.new('marc.dat', 'r:cp866'))
|
192
|
-
def initialize(file, options = {})
|
190
|
+
def initialize(file, options = {})
|
193
191
|
@encoding_options = {}
|
194
192
|
# all can be nil
|
195
193
|
[:internal_encoding, :external_encoding, :invalid, :replace, :validate_encoding].each do |key|
|
196
194
|
@encoding_options[key] = options[key] if options.has_key?(key)
|
197
195
|
end
|
198
|
-
|
199
|
-
if file.is_a?(String)
|
196
|
+
|
197
|
+
if file.is_a?(String)
|
200
198
|
@handle = File.new(file)
|
201
|
-
elsif file.respond_to?(
|
199
|
+
elsif file.respond_to?(:read, 5)
|
202
200
|
@handle = file
|
203
201
|
else
|
204
202
|
raise ArgumentError, "must pass in path or file"
|
205
203
|
end
|
206
|
-
|
207
|
-
if (
|
204
|
+
|
205
|
+
if (!@encoding_options[:external_encoding]) && @handle.respond_to?(:external_encoding)
|
208
206
|
# use file encoding only if we didn't already have an explicit one,
|
209
|
-
# explicit one takes precedence.
|
207
|
+
# explicit one takes precedence.
|
210
208
|
#
|
211
209
|
# Note, please don't use ruby's own internal_encoding transcode
|
212
210
|
# with binary marc data, the transcode can mess up the byte count
|
213
|
-
# and make it unreadable.
|
211
|
+
# and make it unreadable.
|
214
212
|
@encoding_options[:external_encoding] ||= @handle.external_encoding
|
215
213
|
end
|
216
214
|
|
217
215
|
# Only pull in the MARC8 translation if we need it, since it's really big
|
218
|
-
if @encoding_options[:external_encoding]
|
219
|
-
require
|
216
|
+
if @encoding_options[:external_encoding] == "MARC-8"
|
217
|
+
require "marc/marc8/to_unicode" unless defined? MARC::Marc8::ToUnicode
|
220
218
|
end
|
221
|
-
|
222
219
|
end
|
223
220
|
|
224
221
|
# to support iteration:
|
@@ -226,13 +223,13 @@ module MARC
|
|
226
223
|
# print record
|
227
224
|
# end
|
228
225
|
def each
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
self.each_raw do |raw|
|
233
|
-
record = self.decode(raw)
|
226
|
+
if block_given?
|
227
|
+
each_raw do |raw|
|
228
|
+
record = decode(raw)
|
234
229
|
yield record
|
235
230
|
end
|
231
|
+
else
|
232
|
+
enum_for(:each)
|
236
233
|
end
|
237
234
|
end
|
238
235
|
|
@@ -257,10 +254,8 @@ module MARC
|
|
257
254
|
#
|
258
255
|
# If no block is given, an enumerator is returned
|
259
256
|
def each_raw
|
260
|
-
|
261
|
-
|
262
|
-
else
|
263
|
-
while rec_length_s = @handle.read(5)
|
257
|
+
if block_given?
|
258
|
+
while (rec_length_s = @handle.read(5))
|
264
259
|
# make sure the record length looks like an integer
|
265
260
|
rec_length_i = rec_length_s.to_i
|
266
261
|
if rec_length_i == 0
|
@@ -269,9 +264,11 @@ module MARC
|
|
269
264
|
|
270
265
|
# get the raw MARC21 for a record back from the file
|
271
266
|
# using the record length
|
272
|
-
raw = rec_length_s + @handle.read(rec_length_i-5)
|
267
|
+
raw = rec_length_s + @handle.read(rec_length_i - 5)
|
273
268
|
yield raw
|
274
269
|
end
|
270
|
+
else
|
271
|
+
enum_for(:each_raw)
|
275
272
|
end
|
276
273
|
end
|
277
274
|
|
@@ -280,7 +277,7 @@ module MARC
|
|
280
277
|
# Wraps the class method MARC::Reader::decode, using the encoding options of
|
281
278
|
# the MARC::Reader instance.
|
282
279
|
def decode(marc)
|
283
|
-
|
280
|
+
MARC::Reader.decode(marc, @encoding_options)
|
284
281
|
end
|
285
282
|
|
286
283
|
# A static method for turning raw MARC data in transmission
|
@@ -288,34 +285,34 @@ module MARC
|
|
288
285
|
# First argument is a String
|
289
286
|
# options include:
|
290
287
|
# [:external_encoding] encoding of MARC record data values
|
291
|
-
# [:forgiving] needs more docs, true is some kind of forgiving
|
292
|
-
# of certain kinds of bad MARC.
|
293
|
-
def self.decode(marc, params={})
|
288
|
+
# [:forgiving] needs more docs, true is some kind of forgiving
|
289
|
+
# of certain kinds of bad MARC.
|
290
|
+
def self.decode(marc, params = {})
|
294
291
|
if params.has_key?(:encoding)
|
295
|
-
|
292
|
+
warn "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
|
296
293
|
params[:external_encoding] = params.delete(:encoding)
|
297
294
|
end
|
298
|
-
|
299
|
-
if (!
|
295
|
+
|
296
|
+
if (!params.has_key? :external_encoding) && marc.respond_to?(:encoding)
|
300
297
|
# If no forced external_encoding given, respect the encoding
|
301
|
-
# declared on the string passed in.
|
298
|
+
# declared on the string passed in.
|
302
299
|
params[:external_encoding] = marc.encoding
|
303
300
|
end
|
304
301
|
# And now that we've recorded the current encoding, we force
|
305
302
|
# to binary encoding, because we're going to be doing byte arithmetic,
|
306
|
-
# and want to avoid byte-vs-char confusion.
|
303
|
+
# and want to avoid byte-vs-char confusion.
|
307
304
|
marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
|
308
|
-
|
309
|
-
record = Record.new
|
310
|
-
record.leader = marc[0..LEADER_LENGTH-1]
|
305
|
+
|
306
|
+
record = Record.new
|
307
|
+
record.leader = marc[0..LEADER_LENGTH - 1]
|
311
308
|
|
312
309
|
# where the field data starts
|
313
310
|
base_address = record.leader[12..16].to_i
|
314
311
|
|
315
312
|
# get the byte offsets from the record directory
|
316
|
-
directory = marc[LEADER_LENGTH..base_address-1]
|
313
|
+
directory = marc[LEADER_LENGTH..base_address - 1]
|
317
314
|
|
318
|
-
raise MARC::Exception.new("invalid directory in record") if directory
|
315
|
+
raise MARC::Exception.new("invalid directory in record") if directory.nil?
|
319
316
|
|
320
317
|
# the number of fields in the record corresponds to
|
321
318
|
# how many directory entries there are
|
@@ -324,20 +321,19 @@ module MARC
|
|
324
321
|
# when operating in forgiving mode we just split on end of
|
325
322
|
# field instead of using calculated byte offsets from the
|
326
323
|
# directory
|
327
|
-
if params[:forgiving]
|
324
|
+
if params[:forgiving]
|
328
325
|
marc_field_data = marc[base_address..-1]
|
329
326
|
# It won't let us do the split on bad utf8 data, but
|
330
327
|
# we haven't yet set the 'proper' encoding or used
|
331
328
|
# our correction/replace options. So call it binary for now.
|
332
329
|
marc_field_data.force_encoding("binary") if marc_field_data.respond_to?(:force_encoding)
|
333
|
-
|
330
|
+
|
334
331
|
all_fields = marc_field_data.split(END_OF_FIELD)
|
335
332
|
else
|
336
|
-
mba =
|
333
|
+
mba = marc.bytes.to_a
|
337
334
|
end
|
338
335
|
|
339
|
-
0.upto(num_fields-1) do |field_num|
|
340
|
-
|
336
|
+
0.upto(num_fields - 1) do |field_num|
|
341
337
|
# pull the directory entry for a field out
|
342
338
|
entry_start = field_num * DIRECTORY_ENTRY_LENGTH
|
343
339
|
entry_end = entry_start + DIRECTORY_ENTRY_LENGTH
|
@@ -350,12 +346,12 @@ module MARC
|
|
350
346
|
# if we were told to be forgiving we just use the
|
351
347
|
# next available chunk of field data that we
|
352
348
|
# split apart based on the END_OF_FIELD
|
353
|
-
field_data =
|
349
|
+
field_data = ""
|
354
350
|
if params[:forgiving]
|
355
|
-
field_data = all_fields.shift
|
351
|
+
field_data = all_fields.shift
|
356
352
|
|
357
|
-
|
358
|
-
|
353
|
+
# otherwise we actually use the byte offsets in
|
354
|
+
# directory to figure out what field data to extract
|
359
355
|
else
|
360
356
|
length = entry[3..6].to_i
|
361
357
|
offset = entry[7..11].to_i
|
@@ -366,11 +362,11 @@ module MARC
|
|
366
362
|
|
367
363
|
# remove end of field
|
368
364
|
field_data.delete!(END_OF_FIELD)
|
369
|
-
|
365
|
+
|
370
366
|
# add a control field or data field
|
371
367
|
if MARC::ControlField.control_tag?(tag)
|
372
|
-
field_data = MARC::Reader.set_encoding(
|
373
|
-
record.append(MARC::ControlField.new(tag,field_data))
|
368
|
+
field_data = MARC::Reader.set_encoding(field_data, params)
|
369
|
+
record.append(MARC::ControlField.new(tag, field_data))
|
374
370
|
else
|
375
371
|
field = MARC::DataField.new(tag)
|
376
372
|
|
@@ -379,17 +375,17 @@ module MARC
|
|
379
375
|
|
380
376
|
# must have at least 2 elements (indicators, and 1 subfield)
|
381
377
|
# TODO some sort of logging?
|
382
|
-
next if subfields.length
|
378
|
+
next if subfields.length < 2
|
383
379
|
|
384
380
|
# get indicators
|
385
|
-
indicators = MARC::Reader.set_encoding(
|
386
|
-
field.indicator1 = indicators[0,1]
|
387
|
-
field.indicator2 = indicators[1,1]
|
381
|
+
indicators = MARC::Reader.set_encoding(subfields.shift, params)
|
382
|
+
field.indicator1 = indicators[0, 1]
|
383
|
+
field.indicator2 = indicators[1, 1]
|
388
384
|
|
389
385
|
# add each subfield to the field
|
390
|
-
subfields.each
|
391
|
-
data = MARC::Reader.set_encoding(
|
392
|
-
subfield = MARC::Subfield.new(data[0,1],data[1..-1])
|
386
|
+
subfields.each do |data|
|
387
|
+
data = MARC::Reader.set_encoding(data, params)
|
388
|
+
subfield = MARC::Subfield.new(data[0, 1], data[1..-1])
|
393
389
|
field.append(subfield)
|
394
390
|
end
|
395
391
|
|
@@ -398,10 +394,12 @@ module MARC
|
|
398
394
|
end
|
399
395
|
end
|
400
396
|
|
401
|
-
|
402
|
-
end
|
397
|
+
raise MARC::RecordException, record unless record.valid?
|
403
398
|
|
404
|
-
|
399
|
+
record
|
400
|
+
end
|
401
|
+
|
402
|
+
# input passed in probably has 'binary' encoding.
|
405
403
|
# We'll set it to the proper encoding, and depending on settings, optionally
|
406
404
|
# * check for valid encoding
|
407
405
|
# * raise if not valid
|
@@ -411,16 +409,16 @@ module MARC
|
|
411
409
|
# Special case for encoding "MARC-8" -- will be transcoded to
|
412
410
|
# UTF-8 (then further transcoded to internal_encoding, if set).
|
413
411
|
# For "MARC-8", validate_encoding is always true, there's no way to
|
414
|
-
# ignore bad bytes.
|
412
|
+
# ignore bad bytes.
|
415
413
|
#
|
416
414
|
# Params options:
|
417
|
-
#
|
418
|
-
# * external_encoding: what encoding the input is expected to be in
|
415
|
+
#
|
416
|
+
# * external_encoding: what encoding the input is expected to be in
|
419
417
|
# * validate_encoding: if true, will raise if an invalid encoding
|
420
418
|
# * invalid: if set to :replace, will replace bad bytes with replacement
|
421
|
-
# chars instead of raising.
|
419
|
+
# chars instead of raising.
|
422
420
|
# * replace: Set replacement char for use with 'invalid', otherwise defaults
|
423
|
-
# to unicode replacement char, or question mark.
|
421
|
+
# to unicode replacement char, or question mark.
|
424
422
|
def self.set_encoding(str, params)
|
425
423
|
if str.respond_to?(:force_encoding)
|
426
424
|
if params[:external_encoding]
|
@@ -430,41 +428,38 @@ module MARC
|
|
430
428
|
else
|
431
429
|
str = str.force_encoding(params[:external_encoding])
|
432
430
|
end
|
433
|
-
end
|
434
|
-
|
431
|
+
end
|
432
|
+
|
435
433
|
# If we're transcoding anyway, pass our invalid/replace options
|
436
434
|
# on to String#encode, which will take care of them -- or raise
|
437
|
-
# with illegal bytes without :invalid=>:replace.
|
435
|
+
# with illegal bytes without :invalid=>:replace.
|
438
436
|
#
|
439
437
|
# If we're NOT transcoding, we need to use our own pure-ruby
|
440
438
|
# implementation to do invalid byte replacements. OR to raise
|
441
439
|
# a predictable exception iff :validate_encoding, otherwise
|
442
440
|
# for performance we won't check, and you may or may not
|
443
441
|
# get an exception from inside ruby-marc, and it may change
|
444
|
-
# in future implementations.
|
442
|
+
# in future implementations.
|
445
443
|
if params[:internal_encoding]
|
446
|
-
if RUBY_VERSION >=
|
447
|
-
str
|
444
|
+
str = if RUBY_VERSION >= "3.0"
|
445
|
+
str.encode(params[:internal_encoding], **params)
|
448
446
|
else
|
449
|
-
str
|
447
|
+
str.encode(params[:internal_encoding], params)
|
450
448
|
end
|
451
|
-
elsif
|
449
|
+
elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
|
452
450
|
|
453
|
-
if params[:validate_encoding] == true && !
|
454
|
-
raise
|
451
|
+
if params[:validate_encoding] == true && !str.valid_encoding?
|
452
|
+
raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
|
455
453
|
end
|
456
454
|
if params[:invalid] == :replace
|
457
455
|
str = str.scrub(params[:replace])
|
458
456
|
end
|
459
|
-
|
460
|
-
end
|
461
|
-
end
|
462
|
-
return str
|
463
|
-
end
|
464
|
-
end
|
465
|
-
|
466
|
-
|
467
457
|
|
458
|
+
end
|
459
|
+
end
|
460
|
+
str
|
461
|
+
end
|
462
|
+
end
|
468
463
|
|
469
464
|
# Like Reader ForgivingReader lets you read in a batch of MARC21 records
|
470
465
|
# but it does not use record lengths and field byte offsets found in the
|
@@ -479,22 +474,19 @@ module MARC
|
|
479
474
|
#
|
480
475
|
# **NOTE**: ForgivingReader _may_ have unpredictable results when used
|
481
476
|
# with marc records with char encoding other than system default (usually
|
482
|
-
# UTF8), _especially_ if you have Encoding.default_internal set.
|
477
|
+
# UTF8), _especially_ if you have Encoding.default_internal set.
|
483
478
|
#
|
484
479
|
# Implemented a sub-class of Reader over-riding #each, so we still
|
485
480
|
# get DRY Reader's #initialize with proper char encoding options
|
486
|
-
# and handling.
|
481
|
+
# and handling.
|
487
482
|
class ForgivingReader < Reader
|
488
|
-
|
489
483
|
def each
|
490
484
|
@handle.each_line(END_OF_RECORD) do |raw|
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
# TODO add logging
|
497
|
-
end
|
485
|
+
record = MARC::Reader.decode(raw, @encoding_options.merge(forgiving: true))
|
486
|
+
yield record
|
487
|
+
rescue
|
488
|
+
# caught exception just keep barrelling along
|
489
|
+
# TODO add logging
|
498
490
|
end
|
499
491
|
end
|
500
492
|
end
|