marc 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -18
- data/Rakefile +3 -4
- data/lib/marc/marc8/map_to_unicode.rb +16458 -0
- data/lib/marc/marc8/to_unicode.rb +198 -0
- data/lib/marc/reader.rb +133 -112
- data/lib/marc/version.rb +1 -1
- data/test/bad_eacc_encoding.marc8.marc +1 -0
- data/test/bib178448.okay.human +24 -0
- data/test/bib178448.okay.marc +1 -0
- data/test/bib178448.writtenout.marc +1 -0
- data/test/escaped_character_reference.marc8.marc +1 -0
- data/test/marc8/data/test_marc8.txt +1514 -0
- data/test/marc8/data/test_utf8.txt +1514 -0
- data/test/marc8/tc_marc8_mapping.rb +11 -0
- data/test/marc8/tc_to_unicode.rb +154 -0
- data/test/marc_with_bad_utf8.utf8.human +40 -0
- data/test/marc_with_bad_utf8.utf8.marc +1 -0
- data/test/tc_reader_char_encodings.rb +92 -5
- metadata +61 -15
- data/test/tc_weird_jruby_bytes.rb +0 -62
@@ -0,0 +1,198 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'marc'
|
4
|
+
require 'marc/marc8/map_to_unicode'
|
5
|
+
require 'unf/normalizer'
|
6
|
+
|
7
|
+
module MARC
|
8
|
+
module Marc8
|
9
|
+
# Class to convert Marc8 to UTF-8. NOTE: Requires ruby 1.9+ (this could be
|
10
|
+
# changed without too much trouble, but we just don't care to support 1.8.7 anymore.)
|
11
|
+
#
|
12
|
+
# http://www.loc.gov/marc/specifications/speccharmarc8.html
|
13
|
+
#
|
14
|
+
# NOT thread-safe, it needs to keep state as it goes through a string,
|
15
|
+
# do not re-use between threads.
|
16
|
+
#
|
17
|
+
# Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
|
18
|
+
#
|
19
|
+
# Returns UTF-8 encoded string! Encode to something else if you want
|
20
|
+
# something else.
|
21
|
+
#
|
22
|
+
# III proprietary code points?
|
23
|
+
class ToUnicode
    # Character-set code installed as the initial G0 set.
    BASIC_LATIN = 0x42
    # Character-set code installed as the initial G1 set.
    ANSEL = 0x45

    # Bytes that, directly after ESC (0x1B), introduce a G0 / G1 designation.
    G0_SET = ['(', ',', '$']
    G1_SET = [')', '-', '$']

    CODESETS = MARC::Marc8::MapToUnicode::CODESETS

    # These are state flags: MARC8 requires tracking the currently
    # designated character sets, which are changed with escape sequences.
    # They are instance state and are NOT reset by #transcode, so they carry
    # over from one call to the next on the same instance.
    attr_accessor :g0, :g1

    # Starts with Basic Latin designated as G0 and ANSEL as G1.
    def initialize
        self.g0 = BASIC_LATIN
        self.g1 = ANSEL
    end

    # Returns UTF-8 encoded string equivalent of marc8_string passed in.
    #
    # Bad Marc8 bytes? By default will raise an Encoding::InvalidByteSequenceError
    # (will not have full metadata filled out, but will have a decent error message).
    # This includes a multi-byte character cut off by the end of the input.
    #
    # Set option :invalid => :replace to instead silently replace bad bytes
    # with a replacement char -- by default Unicode Replacement Char, but can set
    # option :replace to something else, including empty string.
    #
    #     converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
    #
    # By default returns NFC normalized, but set :normalization option to:
    # :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
    # we won't do any normalization just take it as it comes out of the
    # transcode algorithm. This will generally NOT be composed.
    #
    # By default, escaped unicode 'numeric character references' in Marc8 will
    # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
    # to disable. http://www.loc.gov/marc/specifications/speccharconversion.html#lossless
    #
    # String arg passed in WILL have its encoding tagged 'binary' if
    # it's not already; if it's Marc8 there's no good reason for it not to
    # be already. (A frozen argument is left untouched; a copy is used.)
    #
    # An escape sequence cut off by the end of the input is not fatal: the
    # ESC byte is copied to the output so it doesn't get lost, which may
    # help users diagnose problem records.
    def transcode(marc8_string, options = {})
        invalid_replacement = options.fetch(:replace, "\uFFFD")
        expand_ncr = options.fetch(:expand_ncr, true)
        normalization = options.fetch(:normalization, :nfc)

        # don't choke on empty marc8_string
        return "" if marc8_string.nil? || marc8_string.empty?

        # Make sure to call it 'binary', so we can slice it
        # byte by byte, and so ruby doesn't complain about bad
        # bytes for some other encoding. Yeah, we're changing
        # encoding on input! If it's Marc8, it ought to be tagged
        # binary already. force_encoding raises on a frozen string,
        # so in that case work on a copy instead.
        marc8_string = marc8_string.dup if marc8_string.frozen?
        marc8_string.force_encoding("binary")

        length = marc8_string.length
        uni_list = []
        combinings = []
        pos = 0
        while pos < length
            if marc8_string[pos] == "\x1b"
                next_byte = marc8_string[pos + 1]

                if G0_SET.include?(next_byte)
                    # ESC <intermediate> F, or ESC $ , F -- F names the new G0 set.
                    final_pos = pos + 2
                    final_pos += 1 if next_byte == '$' && marc8_string[final_pos] == ','
                    if final_pos < length
                        self.g0 = marc8_string[final_pos].ord
                        pos = final_pos + 1
                    else
                        # not enough remaining bytes: re-add the escape
                        # character so it doesn't get lost
                        uni_list.push marc8_string[pos]
                        pos += 1
                    end
                    next
                elsif G1_SET.include?(next_byte)
                    # NOTE(review): '$' is in G0_SET as well and that set is
                    # tested first, so an "ESC $ ) F" / "ESC $ - F" sequence
                    # never reaches this branch -- confirm against the spec.
                    final_pos = pos + 2
                    if final_pos < length
                        self.g1 = marc8_string[final_pos].ord
                        pos = final_pos + 1
                    else
                        # truncated designation: same treatment as for G0
                        uni_list.push marc8_string[pos]
                        pos += 1
                    end
                    next
                elsif next_byte.nil?
                    # ESC is the very last byte of the input
                    uni_list.push marc8_string[pos]
                    pos += 1
                    next
                else
                    charset = next_byte.ord
                    if CODESETS.has_key?(charset)
                        self.g0 = charset
                        pos += 2
                        # restart the loop so end-of-input and a directly
                        # following escape sequence are both handled
                        next
                    elsif charset == 0x73
                        self.g0 = BASIC_LATIN
                        pos += 2
                        next
                    end
                    # Unrecognised escape: fall through, the ESC byte is
                    # then handled like any other byte below.
                end
            end

            mb_flag = is_multibyte(self.g0)

            if mb_flag
                if pos + 3 > length
                    # fewer than three bytes left: truncated multi-byte character
                    if options[:invalid] == :replace
                        uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
                        break
                    else
                        raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, truncated multi-byte character")
                    end
                end
                code_point = (marc8_string[pos].ord * 65536 +
                              marc8_string[pos + 1].ord * 256 +
                              marc8_string[pos + 2].ord)
                pos += 3
            else
                code_point = marc8_string[pos].ord
                pos += 1
            end

            # Code points below 0x20, or strictly between 0x80 and 0xa0,
            # are skipped: nothing is added to the output for them.
            next if code_point < 0x20 || (code_point > 0x80 && code_point < 0xa0)

            begin
                code_set = (code_point > 0x80 && !mb_flag) ? self.g1 : self.g0
                (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)

                if cflag
                    # flagged entries are held back and emitted after the
                    # next unflagged character
                    combinings.push unichr(uni)
                else
                    uni_list.push unichr(uni)
                    unless combinings.empty?
                        uni_list.concat combinings
                        combinings = []
                    end
                end
            rescue KeyError
                if options[:invalid] == :replace
                    # Let's coalesce multiple replacements
                    uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
                    # NOTE(review): pos was already advanced past the bad
                    # character above, so this skips one further byte --
                    # kept as-is, confirm it is intended.
                    pos += 1
                else
                    raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
                end
            end
        end

        # what to do if combining chars left over? (currently they are dropped)
        uni_str = uni_list.join('')

        if expand_ncr
            uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
                [$1.hex].pack("U")
            end
        end

        if normalization
            uni_str = UNF::Normalizer.normalize(uni_str, normalization)
        end

        return uni_str
    end

    # from the original python, yeah, apparently
    # only one charset is considered multibyte
    def is_multibyte(charset)
        charset == 0x31
    end

    # input single unicode codepoint as integer; output encoded as a UTF-8 string
    # python has unichr built-in, we just define it for convenience no problem.
    def unichr(code_point)
        [code_point].pack("U")
    end

end
|
197
|
+
end
|
198
|
+
end
|
data/lib/marc/reader.rb
CHANGED
@@ -1,35 +1,48 @@
|
|
1
|
+
require 'ensure_valid_encoding'
|
2
|
+
require 'marc/marc8/to_unicode'
|
3
|
+
|
1
4
|
module MARC
|
2
5
|
# A class for reading MARC binary (ISO 2709) files.
|
3
6
|
#
|
4
7
|
# == Character Encoding
|
5
8
|
#
|
6
|
-
# In ruby 1.
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
26
|
-
#
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
9
|
+
# In ruby 1.9+, ruby tags all strings with expected character encodings.
|
10
|
+
# If illegal bytes for that character encoding are encountered in certain
|
11
|
+
# operations, ruby will raise an exception. If a String is incorrectly
|
12
|
+
# tagged with the wrong character encoding, that makes it fairly likely
|
13
|
+
# an illegal byte for the specified encoding will be encountered.
|
14
|
+
#
|
15
|
+
# So when reading binary MARC data with the MARC::Reader, it's important
|
16
|
+
# that you let it know the expected encoding:
|
17
|
+
#
|
18
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8")
|
19
|
+
#
|
20
|
+
# If you leave off 'external_encoding', it will use the ruby environment
|
21
|
+
# Encoding.default_external, which is usually UTF-8 but may depend on your
|
22
|
+
# environment.
|
23
|
+
#
|
24
|
+
# Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
|
25
|
+
# bytes. By default MARC::Reader will leave these in the produced Strings,
|
26
|
+
# which will probably raise an exception later in your program. Better
|
27
|
+
# to catch this early, and ask MARC::Reader to raise immediately on illegal
|
28
|
+
# bytes:
|
29
|
+
#
|
30
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
31
|
+
# :validate_encoding => true)
|
32
|
+
#
|
33
|
+
# Alternately, you can have MARC::Reader replace illegal bytes
|
34
|
+
# with the Unicode Replacement Character, or with a string
|
35
|
+
# of your choice (including the empty string, meaning just omit the bad bytes)
|
36
|
+
#
|
37
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
38
|
+
# :invalid => :replace)
|
39
|
+
# MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
|
40
|
+
# :invalid => :replace, :replace => "")
|
41
|
+
#
|
42
|
+
# If you supply an :external_encoding argument, MARC::Reader will
|
43
|
+
# always assume that encoding -- if you leave it off, MARC::Reader
|
44
|
+
# will use the encoding tagged on any input you pass in, such
|
45
|
+
# as Strings or File handles.
|
33
46
|
#
|
34
47
|
# # marc data will have same encoding as string.encoding:
|
35
48
|
# MARC::Reader.decode( string )
|
@@ -44,17 +57,42 @@ module MARC
|
|
44
57
|
# # explicitly tell MARC::Reader the encoding
|
45
58
|
# MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
|
46
59
|
#
|
47
|
-
#
|
48
|
-
#
|
49
|
-
#
|
50
|
-
#
|
51
|
-
#
|
60
|
+
# === MARC-8
|
61
|
+
#
|
62
|
+
# The legacy MARC-8 encoding needs to be handled differently, because
|
63
|
+
# there is no built-in support in ruby for MARC-8.
|
64
|
+
#
|
65
|
+
# You _can_ specify "MARC-8" as an external encoding. It will trigger
|
66
|
+
# trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
|
67
|
+
#
|
68
|
+
# MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
|
69
|
+
#
|
70
|
+
# For external_encoding "MARC-8", :validate_encoding is always true,
|
71
|
+
# there's no way to ignore bad bytes in MARC-8 when transcoding to
|
72
|
+
# unicode. However, just as with other encodings, the
|
73
|
+
# `:invalid => :replace` and `:replace => "string"`
|
74
|
+
# options can be used to replace bad bytes instead of raising.
|
75
|
+
#
|
76
|
+
# If you want your MARC-8 to be transcoded internally to something
|
77
|
+
# other than UTF-8, you can use the :internal_encoding option
|
78
|
+
# which works with any encoding in MARC::Reader.
|
79
|
+
#
|
80
|
+
# MARC::Reader.new("marc8.mrc",
|
81
|
+
# :external_encoding => "MARC-8",
|
82
|
+
# :internal_encoding => "UTF-16LE")
|
83
|
+
#
|
84
|
+
# If you want to read in MARC-8 without transcoding, leaving the
|
85
|
+
# internal Strings in MARC-8, the only way to do that is with
|
86
|
+
# ruby's 'binary' (aka "ASCII-8BIT") encoding, since ruby doesn't
|
87
|
+
# know from MARC-8. This will work:
|
88
|
+
#
|
89
|
+
# MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
|
52
90
|
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
#
|
91
|
+
# Please note that MARC::Reader does _not_ currently have any facilities
|
92
|
+
# for guessing encoding from MARC21 leader byte 9, that is ignored.
|
93
|
+
#
|
94
|
+
# === Complete Encoding Options
|
56
95
|
#
|
57
|
-
# == Additional Options
|
58
96
|
# These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
|
59
97
|
# to specify external encoding, ask for a transcode to a different
|
60
98
|
# encoding on read, or validate or replace bad bytes in source.
|
@@ -83,7 +121,7 @@ module MARC
|
|
83
121
|
# your own replacement string for invalid bytes. You may use the
|
84
122
|
# empty string to simply eliminate invalid bytes.
|
85
123
|
#
|
86
|
-
#
|
124
|
+
# === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
|
87
125
|
#
|
88
126
|
# Be careful with using an explicit File object with the File's own
|
89
127
|
# :internal_encoding set -- it can cause ruby to transcode your data
|
@@ -109,11 +147,14 @@ module MARC
|
|
109
147
|
# MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:binary:binary"),
|
110
148
|
# :external_encoding => "cp866",
|
111
149
|
# :internal_encoding => "utf-8")
|
112
|
-
#
|
113
|
-
#
|
114
|
-
#
|
115
|
-
#
|
150
|
+
#
|
151
|
+
# === jruby note
|
152
|
+
# In the past, jruby encoding-related bugs have caused problems with
|
153
|
+
# our encoding treatments. See for example:
|
116
154
|
# https://jira.codehaus.org/browse/JRUBY-6637
|
155
|
+
#
|
156
|
+
# We recommend using the latest version of jruby, especially
|
157
|
+
# at least jruby 1.7.6.
|
117
158
|
class Reader
|
118
159
|
include Enumerable
|
119
160
|
|
@@ -284,31 +325,10 @@ module MARC
|
|
284
325
|
|
285
326
|
# remove end of field
|
286
327
|
field_data.delete!(END_OF_FIELD)
|
287
|
-
|
288
|
-
if field_data.respond_to?(:force_encoding)
|
289
|
-
if params[:external_encoding]
|
290
|
-
field_data = field_data.force_encoding(params[:external_encoding])
|
291
|
-
end
|
292
|
-
|
293
|
-
# If we're transcoding anyway, pass our invalid/replace options
|
294
|
-
# on to String#encode, which will take care of them -- or raise
|
295
|
-
# with illegal bytes without :replace=>:invalid.
|
296
|
-
#
|
297
|
-
# If we're NOT transcoding, we need to use our own pure-ruby
|
298
|
-
# implementation to do invalid byte replacements. OR to raise
|
299
|
-
# a predicatable exception iff :validate_encoding, otherwise
|
300
|
-
# for performance we won't check, and you may or may not
|
301
|
-
# get an exception from inside ruby-marc, and it may change
|
302
|
-
# in future implementations.
|
303
|
-
if params[:internal_encoding]
|
304
|
-
field_data = field_data.encode(params[:internal_encoding], params)
|
305
|
-
elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
|
306
|
-
field_data = MARC::Reader.validate_encoding(field_data, params)
|
307
|
-
end
|
308
|
-
|
309
|
-
end
|
328
|
+
|
310
329
|
# add a control field or data field
|
311
330
|
if MARC::ControlField.control_tag?(tag)
|
331
|
+
field_data = MARC::Reader.set_encoding( field_data , params)
|
312
332
|
record.append(MARC::ControlField.new(tag,field_data))
|
313
333
|
else
|
314
334
|
field = MARC::DataField.new(tag)
|
@@ -321,12 +341,13 @@ module MARC
|
|
321
341
|
next if subfields.length() < 2
|
322
342
|
|
323
343
|
# get indicators
|
324
|
-
indicators = subfields.shift()
|
344
|
+
indicators = MARC::Reader.set_encoding( subfields.shift(), params)
|
325
345
|
field.indicator1 = indicators[0,1]
|
326
346
|
field.indicator2 = indicators[1,1]
|
327
347
|
|
328
348
|
# add each subfield to the field
|
329
349
|
subfields.each() do |data|
|
350
|
+
data = MARC::Reader.set_encoding( data, params )
|
330
351
|
subfield = MARC::Subfield.new(data[0,1],data[1..-1])
|
331
352
|
field.append(subfield)
|
332
353
|
end
|
@@ -337,57 +358,57 @@ module MARC
|
|
337
358
|
end
|
338
359
|
|
339
360
|
return record
|
340
|
-
end
|
341
|
-
|
342
|
-
#
|
343
|
-
#
|
344
|
-
#
|
345
|
-
#
|
346
|
-
#
|
347
|
-
#
|
348
|
-
# with the usual error metadata, sorry.
|
361
|
+
end
|
362
|
+
|
363
|
+
# input passed in probably has 'binary' encoding.
|
364
|
+
# We'll set it to the proper encoding, and depending on settings, optionally
|
365
|
+
# * check for valid encoding
|
366
|
+
# * raise if not valid
|
367
|
+
# * or replace bad bytes with replacement chars if not valid
|
368
|
+
# * transcode from external_encoding to internal_encoding
|
349
369
|
#
|
350
|
-
#
|
351
|
-
#
|
352
|
-
#
|
353
|
-
#
|
354
|
-
# use the unicode replacement char if it thinks it's a unicode encoding,
|
355
|
-
# else ascii '?'.
|
370
|
+
# Special case for encoding "MARC-8" -- will be transcoded to
|
371
|
+
# UTF-8 (then further transcoded to internal_encoding, if set).
|
372
|
+
# For "MARC-8", validate_encoding is always true, there's no way to
|
373
|
+
# ignore bad bytes.
|
356
374
|
#
|
357
|
-
#
|
358
|
-
#
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
|
372
|
-
else
|
373
|
-
# :replace => :invalid,
|
374
|
-
# actually need to go through chars to replace bad ones
|
375
|
-
return str.chars.collect do |c|
|
376
|
-
if c.valid_encoding?
|
377
|
-
c
|
375
|
+
# Params options:
|
376
|
+
#
|
377
|
+
# * external_encoding: what encoding the input is expected to be in
|
378
|
+
# * validate_encoding: if true, will raise if bytes invalid for the encoding are found
|
379
|
+
# * invalid: if set to :replace, will replace bad bytes with replacement
|
380
|
+
# chars instead of raising.
|
381
|
+
# * replace: Set replacement char for use with 'invalid', otherwise defaults
|
382
|
+
# to unicode replacement char, or question mark.
|
383
|
+
def self.set_encoding(str, params)
  # Objects without #force_encoding are handed back unchanged.
  return str unless str.respond_to?(:force_encoding)

  if params[:external_encoding]
    if params[:external_encoding] == "MARC-8"
      # Only the bad-byte options are forwarded to the MARC-8 transcoder.
      transcode_params = [:invalid, :replace].each_with_object({}) { |k, hash| hash[k] = params[k] if params.has_key?(k) }
      str = MARC::Marc8::ToUnicode.new.transcode(str, transcode_params)
    else
      str = str.force_encoding(params[:external_encoding])
    end
  end

  # If we're transcoding anyway, pass our invalid/replace options
  # on to String#encode, which will take care of them -- or raise
  # on illegal bytes without :invalid => :replace.
  #
  # If we're NOT transcoding, the string is handed to
  # EnsureValidEncoding when :invalid, :replace or :validate_encoding
  # is given; otherwise, for performance, we won't check, and you may
  # or may not get an exception from inside ruby-marc, and it may
  # change in future implementations.
  if params[:internal_encoding]
    # The options are spelled out as keyword-style arguments: a positional
    # Hash is no longer treated as options by String#encode on Ruby 3
    # (it would be taken for the source encoding). Options that are not
    # set are passed as nil, which String#encode treats as absent.
    str = str.encode(params[:internal_encoding],
                     :invalid => params[:invalid],
                     :undef => params[:undef],
                     :replace => params[:replace],
                     :fallback => params[:fallback],
                     :xml => params[:xml],
                     :cr_newline => params[:cr_newline],
                     :crlf_newline => params[:crlf_newline],
                     :universal_newline => params[:universal_newline])
  elsif params[:invalid] || params[:replace] || (params[:validate_encoding] == true)
    str = EnsureValidEncoding.ensure_valid_encoding(str, params)
  end

  return str
end
|
391
412
|
end
|
392
413
|
|
393
414
|
|