marc 1.0.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +106 -29
- data/Gemfile +15 -0
- data/README.md +240 -47
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +119 -121
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -81
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +99 -87
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +94 -9
data/lib/marc/reader.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
require
|
1
|
+
require "scrub_rb"
|
2
2
|
|
3
3
|
# Note: requiring 'marc/marc8/to_unicode' below, in #initialize,
|
4
4
|
# only when necessary
|
5
5
|
|
6
6
|
module MARC
|
7
|
-
# A class for reading MARC binary (ISO 2709) files.
|
7
|
+
# A class for reading MARC binary (ISO 2709) files.
|
8
8
|
#
|
9
9
|
# == Character Encoding
|
10
10
|
#
|
@@ -12,7 +12,7 @@ module MARC
|
|
12
12
|
# If illegal bytes for that character encoding are encountered in certain
|
13
13
|
# operations, ruby will raise an exception. If a String is incorrectly
|
14
14
|
# tagged with the wrong character encoding, that makes it fairly likely
|
15
|
-
# an illegal byte for the specified encoding will be encountered.
|
15
|
+
# an illegal byte for the specified encoding will be encountered.
|
16
16
|
#
|
17
17
|
# So when reading binary MARC data with the MARC::Reader, it's important
|
18
18
|
# that you let it know the expected encoding:
|
@@ -21,7 +21,7 @@ module MARC
|
|
21
21
|
#
|
22
22
|
# If you leave off 'external_encoding', it will use the ruby environment
|
23
23
|
# Encoding.default_external, which is usually UTF-8 but may depend on your
|
24
|
-
# environment.
|
24
|
+
# environment.
|
25
25
|
#
|
26
26
|
# Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
|
27
27
|
# bytes. By default MARC::Reader will leave these in the produced Strings,
|
@@ -29,58 +29,58 @@ module MARC
|
|
29
29
|
# to catch this early, and ask MARC::Reader to raise immediately on illegal
|
30
30
|
# bytes:
|
31
31
|
#
|
32
|
-
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
32
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
33
33
|
# :validate_encoding => true)
|
34
34
|
#
|
35
35
|
# Alternately, you can have MARC::Reader replace illegal bytes
|
36
36
|
# with the Unicode Replacement Character, or with a string
|
37
37
|
# of your choice (including the empty string, meaning just omit the bad bytes)
|
38
38
|
#
|
39
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
39
|
+
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
40
40
|
# :invalid => :replace)
|
41
|
-
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
41
|
+
# MARC::Reader("path/to/file.mrc", :external_encoding => "UTF-8",
|
42
42
|
# :invalid => :replace, :replace => "")
|
43
43
|
#
|
44
44
|
# If you supply an :external_encoding argument, MARC::Reader will
|
45
45
|
# always assume that encoding -- if you leave it off, MARC::Reader
|
46
46
|
# will use the encoding tagged on any input you pass in, such
|
47
|
-
# as Strings or File handles.
|
47
|
+
# as Strings or File handles.
|
48
48
|
#
|
49
49
|
# # marc data will have same encoding as string.encoding:
|
50
50
|
# MARC::Reader.decode( string )
|
51
51
|
#
|
52
52
|
# # Same, values will have encoding of string.encoding:
|
53
|
-
# MARC::Reader.new(StringIO.new(string))
|
53
|
+
# MARC::Reader.new(StringIO.new(string))
|
54
54
|
#
|
55
55
|
# # data values will have cp866 encoding, per external_encoding of
|
56
56
|
# # File object passed in
|
57
57
|
# MARC::Reader.new(File.new("myfile.marc", "r:cp866"))
|
58
58
|
#
|
59
59
|
# # explicitly tell MARC::Reader the encoding
|
60
|
-
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
60
|
+
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
61
61
|
#
|
62
62
|
# === MARC-8
|
63
63
|
#
|
64
64
|
# The legacy MARC-8 encoding needs to be handled differently, because
|
65
|
-
# there is no built-in support in ruby for MARC-8.
|
65
|
+
# there is no built-in support in ruby for MARC-8.
|
66
66
|
#
|
67
67
|
# You _can_ specify "MARC-8" as an external encoding. It will trigger
|
68
|
-
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
68
|
+
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
69
69
|
#
|
70
70
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
|
71
71
|
#
|
72
72
|
# For external_encoding "MARC-8", :validate_encoding is always true,
|
73
73
|
# there's no way to ignore bad bytes in MARC-8 when transcoding to
|
74
|
-
# unicode. However, just as with other encodings, the
|
74
|
+
# unicode. However, just as with other encodings, the
|
75
75
|
# `:invalid => :replace` and `:replace => "string"`
|
76
|
-
# options can be used to replace bad bytes instead of raising.
|
76
|
+
# options can be used to replace bad bytes instead of raising.
|
77
77
|
#
|
78
78
|
# If you want your MARC-8 to be transcoded internally to something
|
79
79
|
# other than UTF-8, you can use the :internal_encoding option
|
80
|
-
# which works with any encoding in MARC::Reader.
|
80
|
+
# which works with any encoding in MARC::Reader.
|
81
81
|
#
|
82
|
-
# MARC::Reader.new("marc8.mrc",
|
83
|
-
# :external_encoding => "MARC-8",
|
82
|
+
# MARC::Reader.new("marc8.mrc",
|
83
|
+
# :external_encoding => "MARC-8",
|
84
84
|
# :internal_encoding => "UTF-16LE")
|
85
85
|
#
|
86
86
|
# If you want to read in MARC-8 without transcoding, leaving the
|
@@ -90,48 +90,48 @@ module MARC
|
|
90
90
|
#
|
91
91
|
# MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
|
92
92
|
#
|
93
|
-
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
-
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
93
|
+
# Please note that MARC::Reader does _not_ currently have any facilities
|
94
|
+
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
95
95
|
#
|
96
96
|
# === Complete Encoding Options
|
97
97
|
#
|
98
98
|
# These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
|
99
99
|
# to specify external encoding, ask for a transcode to a different
|
100
|
-
# encoding on read, or validate or replace bad bytes in source.
|
100
|
+
# encoding on read, or validate or replace bad bytes in source.
|
101
101
|
#
|
102
102
|
# [:external_encoding]
|
103
103
|
# What encoding to consider the MARC record's values to be in. This option
|
104
|
-
# takes precedence over the File handle or String argument's encodings.
|
104
|
+
# takes precedence over the File handle or String argument's encodings.
|
105
105
|
# [:internal_encoding]
|
106
106
|
# Ask MARC::Reader to transcode to this encoding in memory after reading
|
107
|
-
# the file in.
|
107
|
+
# the file in.
|
108
108
|
# [:validate_encoding]
|
109
109
|
# If you pass in `true`, MARC::Reader will promise to raise an Encoding::InvalidByteSequenceError
|
110
110
|
# if there are illegal bytes in the source for the :external_encoding. There is
|
111
111
|
# a performance penalty for this check. Without this option, an exception
|
112
|
-
# _may_ or _may not_ be raised, and whether an exception or raised (or
|
112
|
+
# _may_ or _may not_ be raised, and whether an exception or raised (or
|
113
113
|
# what class the exception has) may change in future ruby-marc versions
|
114
|
-
# without warning.
|
114
|
+
# without warning.
|
115
115
|
# [:invalid]
|
116
116
|
# Just like String#encode, set to :replace and any bytes in source data
|
117
|
-
# illegal for the source encoding will be replaced with the unicode
|
117
|
+
# illegal for the source encoding will be replaced with the unicode
|
118
118
|
# replacement character (when in unicode encodings), or else '?'. Overrides
|
119
119
|
# :validate_encoding. This can help you sanitize your input and
|
120
|
-
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
120
|
+
# avoid ruby "invalid UTF-8 byte" exceptions later.
|
121
121
|
# [:replace]
|
122
122
|
# Just like String#encode, combine with `:invalid=>:replace`, set
|
123
123
|
# your own replacement string for invalid bytes. You may use the
|
124
|
-
# empty string to simply eliminate invalid bytes.
|
124
|
+
# empty string to simply eliminate invalid bytes.
|
125
125
|
#
|
126
126
|
# === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
|
127
127
|
#
|
128
|
-
# Be careful with using an explicit File object with the File's own
|
129
|
-
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
-
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
128
|
+
# Be careful with using an explicit File object with the File's own
|
129
|
+
# :internal_encoding set -- it can cause ruby to transcode your data
|
130
|
+
# _before_ MARC::Reader gets it, changing the bytecount and making the
|
131
131
|
# marc record unreadable in some cases. This
|
132
132
|
# applies to Encoding.default_internal too!
|
133
133
|
#
|
134
|
-
# # May in some cases result in unreadable marc and an exception
|
134
|
+
# # May in some cases result in unreadable marc and an exception
|
135
135
|
# MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:cp866:utf-8") )
|
136
136
|
#
|
137
137
|
# # May in some cases result in unreadable marc and an exception
|
@@ -156,7 +156,7 @@ module MARC
|
|
156
156
|
# https://jira.codehaus.org/browse/JRUBY-6637
|
157
157
|
#
|
158
158
|
# We recommend using the latest version of jruby, especially
|
159
|
-
# at least jruby 1.7.6.
|
159
|
+
# at least jruby 1.7.6.
|
160
160
|
class Reader
|
161
161
|
include Enumerable
|
162
162
|
|
@@ -182,43 +182,42 @@ module MARC
|
|
182
182
|
#
|
183
183
|
# Also, if your data is encoded with a non ascii/utf-8 encoding
|
184
184
|
# (for ex. when reading RUSMARC data) and you use ruby 1.9
|
185
|
-
# you can specify source data encoding with an option.
|
185
|
+
# you can specify source data encoding with an option.
|
186
186
|
#
|
187
187
|
# reader = MARC::Reader.new('marc.dat', :external_encoding => 'cp866')
|
188
188
|
#
|
189
189
|
# or, you can pass IO, opened in the corresponding encoding
|
190
190
|
#
|
191
191
|
# reader = MARC::Reader.new(File.new('marc.dat', 'r:cp866'))
|
192
|
-
def initialize(file, options = {})
|
192
|
+
def initialize(file, options = {})
|
193
193
|
@encoding_options = {}
|
194
194
|
# all can be nil
|
195
195
|
[:internal_encoding, :external_encoding, :invalid, :replace, :validate_encoding].each do |key|
|
196
196
|
@encoding_options[key] = options[key] if options.has_key?(key)
|
197
197
|
end
|
198
|
-
|
199
|
-
if file.is_a?(String)
|
198
|
+
|
199
|
+
if file.is_a?(String)
|
200
200
|
@handle = File.new(file)
|
201
|
-
elsif file.respond_to?(
|
201
|
+
elsif file.respond_to?(:read, 5)
|
202
202
|
@handle = file
|
203
203
|
else
|
204
204
|
raise ArgumentError, "must pass in path or file"
|
205
205
|
end
|
206
|
-
|
207
|
-
if (
|
206
|
+
|
207
|
+
if (!@encoding_options[:external_encoding]) && @handle.respond_to?(:external_encoding)
|
208
208
|
# use file encoding only if we didn't already have an explicit one,
|
209
|
-
# explicit one takes precedence.
|
209
|
+
# explicit one takes precedence.
|
210
210
|
#
|
211
211
|
# Note, please don't use ruby's own internal_encoding transcode
|
212
212
|
# with binary marc data, the transcode can mess up the byte count
|
213
|
-
# and make it unreadable.
|
213
|
+
# and make it unreadable.
|
214
214
|
@encoding_options[:external_encoding] ||= @handle.external_encoding
|
215
215
|
end
|
216
216
|
|
217
217
|
# Only pull in the MARC8 translation if we need it, since it's really big
|
218
|
-
if @encoding_options[:external_encoding]
|
219
|
-
require
|
218
|
+
if @encoding_options[:external_encoding] == "MARC-8"
|
219
|
+
require "marc/marc8/to_unicode" unless defined? MARC::Marc8::ToUnicode
|
220
220
|
end
|
221
|
-
|
222
221
|
end
|
223
222
|
|
224
223
|
# to support iteration:
|
@@ -226,13 +225,13 @@ module MARC
|
|
226
225
|
# print record
|
227
226
|
# end
|
228
227
|
def each
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
self.each_raw do |raw|
|
233
|
-
record = self.decode(raw)
|
228
|
+
if block_given?
|
229
|
+
each_raw do |raw|
|
230
|
+
record = decode(raw)
|
234
231
|
yield record
|
235
232
|
end
|
233
|
+
else
|
234
|
+
enum_for(:each)
|
236
235
|
end
|
237
236
|
end
|
238
237
|
|
@@ -257,10 +256,8 @@ module MARC
|
|
257
256
|
#
|
258
257
|
# If no block is given, an enumerator is returned
|
259
258
|
def each_raw
|
260
|
-
|
261
|
-
|
262
|
-
else
|
263
|
-
while rec_length_s = @handle.read(5)
|
259
|
+
if block_given?
|
260
|
+
while (rec_length_s = @handle.read(5))
|
264
261
|
# make sure the record length looks like an integer
|
265
262
|
rec_length_i = rec_length_s.to_i
|
266
263
|
if rec_length_i == 0
|
@@ -269,9 +266,11 @@ module MARC
|
|
269
266
|
|
270
267
|
# get the raw MARC21 for a record back from the file
|
271
268
|
# using the record length
|
272
|
-
raw = rec_length_s + @handle.read(rec_length_i-5)
|
269
|
+
raw = rec_length_s + @handle.read(rec_length_i - 5)
|
273
270
|
yield raw
|
274
271
|
end
|
272
|
+
else
|
273
|
+
enum_for(:each_raw)
|
275
274
|
end
|
276
275
|
end
|
277
276
|
|
@@ -280,7 +279,7 @@ module MARC
|
|
280
279
|
# Wraps the class method MARC::Reader::decode, using the encoding options of
|
281
280
|
# the MARC::Reader instance.
|
282
281
|
def decode(marc)
|
283
|
-
|
282
|
+
MARC::Reader.decode(marc, @encoding_options)
|
284
283
|
end
|
285
284
|
|
286
285
|
# A static method for turning raw MARC data in transmission
|
@@ -288,34 +287,34 @@ module MARC
|
|
288
287
|
# First argument is a String
|
289
288
|
# options include:
|
290
289
|
# [:external_encoding] encoding of MARC record data values
|
291
|
-
# [:forgiving] needs more docs, true is some kind of forgiving
|
292
|
-
# of certain kinds of bad MARC.
|
293
|
-
def self.decode(marc, params={})
|
290
|
+
# [:forgiving] needs more docs, true is some kind of forgiving
|
291
|
+
# of certain kinds of bad MARC.
|
292
|
+
def self.decode(marc, params = {})
|
294
293
|
if params.has_key?(:encoding)
|
295
|
-
|
294
|
+
warn "DEPRECATION WARNING: MARC::Reader.decode :encoding option deprecated, please use :external_encoding"
|
296
295
|
params[:external_encoding] = params.delete(:encoding)
|
297
296
|
end
|
298
|
-
|
299
|
-
if (!
|
297
|
+
|
298
|
+
if (!params.has_key? :external_encoding) && marc.respond_to?(:encoding)
|
300
299
|
# If no forced external_encoding given, respect the encoding
|
301
|
-
# declared on the string passed in.
|
300
|
+
# declared on the string passed in.
|
302
301
|
params[:external_encoding] = marc.encoding
|
303
302
|
end
|
304
303
|
# And now that we've recorded the current encoding, we force
|
305
304
|
# to binary encoding, because we're going to be doing byte arithmetic,
|
306
|
-
# and want to avoid byte-vs-char confusion.
|
305
|
+
# and want to avoid byte-vs-char confusion.
|
307
306
|
marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
|
308
|
-
|
309
|
-
record = Record.new
|
310
|
-
record.leader = marc[0..LEADER_LENGTH-1]
|
307
|
+
|
308
|
+
record = Record.new
|
309
|
+
record.leader = marc[0..LEADER_LENGTH - 1]
|
311
310
|
|
312
311
|
# where the field data starts
|
313
312
|
base_address = record.leader[12..16].to_i
|
314
313
|
|
315
314
|
# get the byte offsets from the record directory
|
316
|
-
directory = marc[LEADER_LENGTH..base_address-1]
|
315
|
+
directory = marc[LEADER_LENGTH..base_address - 1]
|
317
316
|
|
318
|
-
raise MARC::Exception.new("invalid directory in record") if directory
|
317
|
+
raise MARC::Exception.new("invalid directory in record") if directory.nil?
|
319
318
|
|
320
319
|
# the number of fields in the record corresponds to
|
321
320
|
# how many directory entries there are
|
@@ -324,20 +323,19 @@ module MARC
|
|
324
323
|
# when operating in forgiving mode we just split on end of
|
325
324
|
# field instead of using calculated byte offsets from the
|
326
325
|
# directory
|
327
|
-
if params[:forgiving]
|
326
|
+
if params[:forgiving]
|
328
327
|
marc_field_data = marc[base_address..-1]
|
329
328
|
# It won't let us do the split on bad utf8 data, but
|
330
329
|
# we haven't yet set the 'proper' encoding or used
|
331
330
|
# our correction/replace options. So call it binary for now.
|
332
331
|
marc_field_data.force_encoding("binary") if marc_field_data.respond_to?(:force_encoding)
|
333
|
-
|
332
|
+
|
334
333
|
all_fields = marc_field_data.split(END_OF_FIELD)
|
335
334
|
else
|
336
|
-
mba =
|
335
|
+
mba = marc.bytes.to_a
|
337
336
|
end
|
338
337
|
|
339
|
-
0.upto(num_fields-1) do |field_num|
|
340
|
-
|
338
|
+
0.upto(num_fields - 1) do |field_num|
|
341
339
|
# pull the directory entry for a field out
|
342
340
|
entry_start = field_num * DIRECTORY_ENTRY_LENGTH
|
343
341
|
entry_end = entry_start + DIRECTORY_ENTRY_LENGTH
|
@@ -350,12 +348,12 @@ module MARC
|
|
350
348
|
# if we were told to be forgiving we just use the
|
351
349
|
# next available chuck of field data that we
|
352
350
|
# split apart based on the END_OF_FIELD
|
353
|
-
field_data =
|
351
|
+
field_data = ""
|
354
352
|
if params[:forgiving]
|
355
|
-
field_data = all_fields.shift
|
353
|
+
field_data = all_fields.shift
|
356
354
|
|
357
|
-
|
358
|
-
|
355
|
+
# otherwise we actually use the byte offsets in
|
356
|
+
# directory to figure out what field data to extract
|
359
357
|
else
|
360
358
|
length = entry[3..6].to_i
|
361
359
|
offset = entry[7..11].to_i
|
@@ -366,11 +364,11 @@ module MARC
|
|
366
364
|
|
367
365
|
# remove end of field
|
368
366
|
field_data.delete!(END_OF_FIELD)
|
369
|
-
|
367
|
+
|
370
368
|
# add a control field or data field
|
371
369
|
if MARC::ControlField.control_tag?(tag)
|
372
|
-
field_data = MARC::Reader.set_encoding(
|
373
|
-
record.append(MARC::ControlField.new(tag,field_data))
|
370
|
+
field_data = MARC::Reader.set_encoding(field_data, params)
|
371
|
+
record.append(MARC::ControlField.new(tag, field_data))
|
374
372
|
else
|
375
373
|
field = MARC::DataField.new(tag)
|
376
374
|
|
@@ -379,17 +377,17 @@ module MARC
|
|
379
377
|
|
380
378
|
# must have at least 2 elements (indicators, and 1 subfield)
|
381
379
|
# TODO some sort of logging?
|
382
|
-
next if subfields.length
|
380
|
+
next if subfields.length < 2
|
383
381
|
|
384
382
|
# get indicators
|
385
|
-
indicators = MARC::Reader.set_encoding(
|
386
|
-
field.indicator1 = indicators[0,1]
|
387
|
-
field.indicator2 = indicators[1,1]
|
383
|
+
indicators = MARC::Reader.set_encoding(subfields.shift, params)
|
384
|
+
field.indicator1 = indicators[0, 1]
|
385
|
+
field.indicator2 = indicators[1, 1]
|
388
386
|
|
389
387
|
# add each subfield to the field
|
390
|
-
subfields.each
|
391
|
-
data = MARC::Reader.set_encoding(
|
392
|
-
subfield = MARC::Subfield.new(data[0,1],data[1..-1])
|
388
|
+
subfields.each do |data|
|
389
|
+
data = MARC::Reader.set_encoding(data, params)
|
390
|
+
subfield = MARC::Subfield.new(data[0, 1], data[1..-1])
|
393
391
|
field.append(subfield)
|
394
392
|
end
|
395
393
|
|
@@ -398,10 +396,12 @@ module MARC
|
|
398
396
|
end
|
399
397
|
end
|
400
398
|
|
401
|
-
|
402
|
-
|
399
|
+
raise MARC::RecordException, record unless record.valid?
|
400
|
+
|
401
|
+
record
|
402
|
+
end
|
403
403
|
|
404
|
-
# input passed in probably has 'binary' encoding.
|
404
|
+
# input passed in probably has 'binary' encoding.
|
405
405
|
# We'll set it to the proper encoding, and depending on settings, optionally
|
406
406
|
# * check for valid encoding
|
407
407
|
# * raise if not valid
|
@@ -411,16 +411,16 @@ module MARC
|
|
411
411
|
# Special case for encoding "MARC-8" -- will be transcoded to
|
412
412
|
# UTF-8 (then further transcoded to internal_encoding, if set).
|
413
413
|
# For "MARC-8", validate_encoding is always true, there's no way to
|
414
|
-
# ignore bad bytes.
|
414
|
+
# ignore bad bytes.
|
415
415
|
#
|
416
416
|
# Params options:
|
417
|
-
#
|
418
|
-
# * external_encoding: what encoding the input is expected to be in
|
417
|
+
#
|
418
|
+
# * external_encoding: what encoding the input is expected to be in
|
419
419
|
# * validate_encoding: if true, will raise if an invalid encoding
|
420
420
|
# * invalid: if set to :replace, will replace bad bytes with replacement
|
421
|
-
# chars instead of raising.
|
421
|
+
# chars instead of raising.
|
422
422
|
# * replace: Set replacement char for use with 'invalid', otherwise defaults
|
423
|
-
# to unicode replacement char, or question mark.
|
423
|
+
# to unicode replacement char, or question mark.
|
424
424
|
def self.set_encoding(str, params)
|
425
425
|
if str.respond_to?(:force_encoding)
|
426
426
|
if params[:external_encoding]
|
@@ -430,37 +430,38 @@ module MARC
|
|
430
430
|
else
|
431
431
|
str = str.force_encoding(params[:external_encoding])
|
432
432
|
end
|
433
|
-
end
|
434
|
-
|
433
|
+
end
|
434
|
+
|
435
435
|
# If we're transcoding anyway, pass our invalid/replace options
|
436
436
|
# on to String#encode, which will take care of them -- or raise
|
437
|
-
# with illegal bytes without :replace=>:invalid.
|
437
|
+
# with illegal bytes without :replace=>:invalid.
|
438
438
|
#
|
439
439
|
# If we're NOT transcoding, we need to use our own pure-ruby
|
440
440
|
# implementation to do invalid byte replacements. OR to raise
|
441
441
|
# a predictable exception iff :validate_encoding, otherwise
|
442
442
|
# for performance we won't check, and you may or may not
|
443
443
|
# get an exception from inside ruby-marc, and it may change
|
444
|
-
# in future implementations.
|
444
|
+
# in future implementations.
|
445
445
|
if params[:internal_encoding]
|
446
|
-
str =
|
447
|
-
|
446
|
+
str = if RUBY_VERSION >= "3.0"
|
447
|
+
str.encode(params[:internal_encoding], **params)
|
448
|
+
else
|
449
|
+
str.encode(params[:internal_encoding], params)
|
450
|
+
end
|
451
|
+
elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
|
448
452
|
|
449
|
-
if params[:validate_encoding] == true && !
|
450
|
-
raise
|
453
|
+
if params[:validate_encoding] == true && !str.valid_encoding?
|
454
|
+
raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
|
451
455
|
end
|
452
456
|
if params[:invalid] == :replace
|
453
457
|
str = str.scrub(params[:replace])
|
454
458
|
end
|
455
|
-
|
456
|
-
end
|
457
|
-
end
|
458
|
-
return str
|
459
|
-
end
|
460
|
-
end
|
461
|
-
|
462
|
-
|
463
459
|
|
460
|
+
end
|
461
|
+
end
|
462
|
+
str
|
463
|
+
end
|
464
|
+
end
|
464
465
|
|
465
466
|
# Like Reader ForgivingReader lets you read in a batch of MARC21 records
|
466
467
|
# but it does not use record lengths and field byte offsets found in the
|
@@ -475,22 +476,19 @@ module MARC
|
|
475
476
|
#
|
476
477
|
# **NOTE**: ForgivingReader _may_ have unpredictable results when used
|
477
478
|
# with marc records with char encoding other than system default (usually
|
478
|
-
# UTF8), _especially_ if you have Encoding.default_internal set.
|
479
|
+
# UTF8), _especially_ if you have Encoding.default_internal set.
|
479
480
|
#
|
480
481
|
# Implemented a sub-class of Reader over-riding #each, so we still
|
481
482
|
# get DRY Reader's #initialize with proper char encoding options
|
482
|
-
# and handling.
|
483
|
+
# and handling.
|
483
484
|
class ForgivingReader < Reader
|
484
|
-
|
485
485
|
def each
|
486
486
|
@handle.each_line(END_OF_RECORD) do |raw|
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
# TODO add logging
|
493
|
-
end
|
487
|
+
record = MARC::Reader.decode(raw, @encoding_options.merge(forgiving: true))
|
488
|
+
yield record
|
489
|
+
rescue
|
490
|
+
# caught exception just keep barrelling along
|
491
|
+
# TODO add logging
|
494
492
|
end
|
495
493
|
end
|
496
494
|
end
|