marc 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,198 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'marc'
4
+ require 'marc/marc8/map_to_unicode'
5
+ require 'unf/normalizer'
6
+
7
module MARC
  module Marc8
    # Class to convert Marc8 to UTF-8. NOTE: Requires ruby 1.9+ (this could be
    # changed without too much trouble, but we just don't care to support
    # 1.8.7 anymore.)
    #
    # http://www.loc.gov/marc/specifications/speccharmarc8.html
    #
    # NOT thread-safe: the converter keeps the currently designated G0/G1
    # character sets in instance state (#g0, #g1) while it walks a string,
    # so do not share one instance between threads. That state is set in
    # #initialize only; #transcode does not reset it, so a designation left
    # active by one call is still active at the start of the next call on
    # the same instance.
    #
    # The implementation is a port of a Python original and intentionally
    # stays close to its structure.
    #
    # Returns UTF-8 encoded string! Encode to something else if you want
    # something else.
    #
    # Open question: III proprietary code points?
    class ToUnicode
      # Final byte designating Basic Latin (ASCII); the initial G0 set.
      BASIC_LATIN = 0x42
      # Final byte designating ANSEL (Extended Latin); the initial G1 set.
      ANSEL = 0x45

      # Intermediate bytes that may follow ESC when designating a set.
      # '$' (multi-byte designation) appears in both lists; it targets G0
      # unless a second intermediate of ')' or '-' follows it.
      G0_SET = ['(', ',', '$']
      G1_SET = [')', '-', '$']

      CODESETS = MARC::Marc8::MapToUnicode::CODESETS

      # The escape byte that introduces a character set designation.
      ESCAPE = "\x1b".freeze

      # These are state flags, MARC8 requires you to keep
      # track of 'current char sets' or something like that, which
      # are changed with escape codes, or something like that.
      attr_accessor :g0, :g1

      def initialize
        self.g0 = BASIC_LATIN
        self.g1 = ANSEL
      end

      # Returns UTF-8 encoded string equivalent of marc8_string passed in.
      # Returns "" for a nil or empty argument.
      #
      # Bad Marc8 bytes? By default will raise an
      # Encoding::InvalidByteSequenceError (will not have full metadata
      # filled out, but will have a decent error message). A multi-byte
      # character cut off by the end of the string is treated as bad bytes
      # too.
      #
      # Set option :invalid => :replace to instead silently replace bad bytes
      # with a replacement char -- by default Unicode Replacement Char, but
      # can set option :replace to something else, including empty string.
      # Consecutive replacements are coalesced into one.
      #
      #     converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
      #
      # By default returns NFC normalized, but set :normalization option to:
      # :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
      # we won't do any normalization just take it as it comes out of the
      # transcode algorithm. This will generally NOT be composed.
      #
      # By default, escaped unicode 'numeric character references' in Marc8
      # (of the form "&#x200F;") will be translated to actual UTF8. Pass
      # :expand_ncr => false to disable.
      # http://www.loc.gov/marc/specifications/speccharconversion.html#lossless
      #
      # An escape sequence that is cut off by the end of the string is not
      # an error: the ESC character is copied to the output so it doesn't get
      # lost (may help users diagnose problem records) and the bytes after it
      # are decoded as ordinary characters.
      #
      # String arg passed in WILL have its encoding tagged 'binary' if
      # it's not already; if it's Marc8 there's no good reason for it not to
      # be already. A frozen argument cannot be re-tagged, so a copy is
      # decoded instead and the argument is left untouched.
      def transcode(marc8_string, options = {})
        invalid_replacement = options.fetch(:replace, "\uFFFD")
        expand_ncr = options.fetch(:expand_ncr, true)
        normalization = options.fetch(:normalization, :nfc)
        replace_invalid = (options[:invalid] == :replace)

        # don't choke on empty marc8_string
        return "" if marc8_string.nil? || marc8_string.empty?

        # Make sure to call it 'binary', so we can slice it
        # byte by byte, and so ruby doesn't complain about bad
        # bytes for some other encoding. Yeah, we're changing
        # encoding on input! If it's Marc8, it ought to be tagged
        # binary already.
        marc8_string = marc8_string.dup if marc8_string.frozen?
        marc8_string.force_encoding("binary")
        length = marc8_string.length

        uni_list = []
        # MARC-8 puts combining characters BEFORE their base character;
        # they are held here until the next base character is emitted.
        combinings = []
        pos = 0
        while pos < length
          if marc8_string[pos] == ESCAPE
            consumed = consume_escape(marc8_string, pos)
            if consumed == :truncated
              # not enough remaining bytes: re-add the escape character so
              # it doesn't get lost
              uni_list.push ESCAPE
              pos += 1
              next
            elsif consumed
              # Restart the loop so that end-of-input and a directly
              # following escape sequence are both handled.
              pos += consumed
              next
            end
            # Unrecognized escape: fall through and decode the ESC byte
            # like any other byte.
          end

          mb_flag = is_multibyte(g0)

          if mb_flag
            if pos + 3 > length
              # A multi-byte character needs three bytes; fewer remain.
              unless replace_invalid
                raise Encoding::InvalidByteSequenceError, "MARC8, input byte offset #{pos}, code set: 0x#{g0.to_s(16)}, truncated multi-byte character"
              end
              uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
              break
            end
            code_point = (marc8_string.getbyte(pos) * 65536 +
                          marc8_string.getbyte(pos + 1) * 256 +
                          marc8_string.getbyte(pos + 2))
            pos += 3
          else
            code_point = marc8_string.getbyte(pos)
            pos += 1
          end

          # Control characters (C0, and 0x81-0x9f) produce no output.
          next if code_point < 0x20 || (code_point > 0x80 && code_point < 0xa0)

          begin
            # Single-byte values above 0x80 come from G1, everything else
            # from G0.
            code_set = (code_point > 0x80 && !mb_flag) ? g1 : g0
            uni, cflag = CODESETS.fetch(code_set).fetch(code_point)

            if cflag
              combinings.push unichr(uni)
            else
              uni_list.push unichr(uni)
              unless combinings.empty?
                uni_list.concat combinings
                combinings = []
              end
            end
          rescue KeyError
            # Unknown code set, or code point not mapped in that set.
            if replace_invalid
              # Let's coalesce multiple replacements
              uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
              # NOTE(review): pos was already advanced past the bad
              # character above, so this skips one further byte. Kept as-is
              # because existing output depends on it -- confirm intent.
              pos += 1
            else
              raise Encoding::InvalidByteSequenceError, "MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}"
            end
          end
        end

        # what to do if combining chars left over? (they are currently
        # dropped)
        uni_str = uni_list.join('')
        # Joining nothing, or only ASCII pieces, can yield a string that is
        # not tagged UTF-8; re-tagging pure ASCII is always safe.
        uni_str.force_encoding("UTF-8") if uni_str.ascii_only?

        if expand_ncr
          uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
            [$1.hex].pack("U")
          end
        end

        if normalization
          uni_str = UNF::Normalizer.normalize(uni_str, normalization)
        end

        return uni_str
      end

      # from the original python, yeah, apparently
      # only one charset is considered multibyte
      def is_multibyte(charset)
        charset == 0x31
      end

      # input single unicode codepoint as integer; output encoded as a UTF-8
      # string. python has unichr built-in, we just define it for convenience
      # no problem.
      def unichr(code_point)
        [code_point].pack("U")
      end

      private

      # Interprets the escape sequence that starts at byte offset pos of
      # marc8_string (the byte at pos must be ESC) and updates g0 / g1
      # accordingly.
      #
      # Recognized forms (F is the final byte naming the character set):
      #
      #     ESC ( F,  ESC , F            single-byte set as G0
      #     ESC ) F,  ESC - F            single-byte set as G1
      #     ESC $ F,  ESC $ , F          multi-byte set as G0
      #     ESC $ ) F,  ESC $ - F        multi-byte set as G1
      #     ESC <key of CODESETS>        that set as G0
      #     ESC s (0x73)                 Basic Latin as G0
      #
      # Returns the number of bytes the sequence occupies, :truncated when
      # the string ends before the sequence is complete, or nil when the
      # byte after ESC does not start any recognized sequence. State is only
      # modified when a byte count is returned.
      #
      # NOTE(review): a multi-byte set designated as G1 is recorded in g1,
      # but #transcode only switches to three-byte reads based on g0.
      def consume_escape(marc8_string, pos)
        intermediate = marc8_string[pos + 1]
        return :truncated if intermediate.nil?

        in_g0_set = G0_SET.include?(intermediate)
        if in_g0_set || G1_SET.include?(intermediate)
          to_g1 = !in_g0_set
          final_pos = pos + 2

          if intermediate == '$'
            # An optional second intermediate picks the target set.
            case marc8_string[final_pos]
            when ','
              final_pos += 1
            when ')', '-'
              to_g1 = true
              final_pos += 1
            end
          end

          final_byte = marc8_string.getbyte(final_pos)
          return :truncated if final_byte.nil?

          if to_g1
            self.g1 = final_byte
          else
            self.g0 = final_byte
          end
          return final_pos - pos + 1
        end

        charset = intermediate.ord
        if CODESETS.has_key?(charset)
          self.g0 = charset
          2
        elsif charset == 0x73
          self.g0 = BASIC_LATIN
          2
        end
      end
    end
  end
end
data/lib/marc/reader.rb CHANGED
@@ -1,35 +1,48 @@
1
+ require 'ensure_valid_encoding'
2
+ require 'marc/marc8/to_unicode'
3
+
1
4
  module MARC
2
5
  # A class for reading MARC binary (ISO 2709) files.
3
6
  #
4
7
  # == Character Encoding
5
8
  #
6
- # In ruby 1.8, if you mess up your character encodings, you may get
7
- # garbage bytes. MARC::Reader takes no special action to determine or
8
- # correct character encodings in ruby 1.8.
9
- #
10
- # In ruby 1.9, if character encodings get confused, you will likely get an
11
- # exception raised at some point, either from inside MARC::Reader or in your
12
- # own code. If your marc records are not in UTF-8, you will have to make sure
13
- # MARC::Reader knows what character encoding to expect. For UTF-8, normally
14
- # it will just work.
15
- #
16
- # Note that if your source data includes invalid illegal characters
17
- # for it's encoding, while it _may_ not cause MARC::Reader to raise an
18
- # exception, it will likely result in an exception at a later point in
19
- # your own code. You can ask MARC::Reader to remove invalid bytes from data,
20
- # see :invalid and :replace options below.
21
- #
22
- # In ruby 1.9, it's important strings are tagged with their proper encoding.
23
- # **MARC::Reader does _not_ at present look inside the MARC file to see what
24
- # encoding it claims for itself** -- real world MARC records are so unreliable
25
- # here as to limit utility; and we have international users and international
26
- # MARC uses several conventions for this. Instead, MARC::Reader uses ordinary
27
- # ruby conventions. If your data is in UTF-8, it'll probably Just Work,
28
- # otherwise you simply have to tell MARC::Reader what the source encoding is:
29
- #
30
- # Encoding.default_external # => usually "UTF-8" for most people
31
- # # marc data will be considered UTF-8, as per Encoding.default_external
32
- # MARC::Reader.new("path/to/file.marc")
9
+ # In ruby 1.9+, ruby tags all strings with expected character encodings.
10
+ # If illegal bytes for that character encoding are encountered in certain
11
+ # operations, ruby will raise an exception. If a String is incorrectly
12
+ # tagged with the wrong character encoding, that makes it fairly likely
13
+ # an illegal byte for the specified encoding will be encountered.
14
+ #
15
+ # So when reading binary MARC data with the MARC::Reader, it's important
16
+ # that you let it know the expected encoding:
17
+ #
18
+ # MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8")
19
+ #
20
+ # If you leave off 'external_encoding', it will use the ruby environment
21
+ # Encoding.default_external, which is usually UTF-8 but may depend on your
22
+ # environment.
23
+ #
24
+ # Even if you expect your data to be (eg) UTF-8, it may include bad/illegal
25
+ # bytes. By default MARC::Reader will leave these in the produced Strings,
26
+ # which will probably raise an exception later in your program. Better
27
+ # to catch this early, and ask MARC::Reader to raise immediately on illegal
28
+ # bytes:
29
+ #
30
+ # MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
31
+ # :validate_encoding => true)
32
+ #
33
+ # Alternately, you can have MARC::Reader replace illegal bytes
34
+ # with the Unicode Replacement Character, or with a string
35
+ # of your choice (including the empty string, meaning just omit the bad bytes)
36
+ #
37
+ # MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
38
+ # :invalid => :replace)
39
+ # MARC::Reader.new("path/to/file.mrc", :external_encoding => "UTF-8",
40
+ # :invalid => :replace, :replace => "")
41
+ #
42
+ # If you supply an :external_encoding argument, MARC::Reader will
43
+ # always assume that encoding -- if you leave it off, MARC::Reader
44
+ # will use the encoding tagged on any input you pass in, such
45
+ # as Strings or File handles.
33
46
  #
34
47
  # # marc data will have same encoding as string.encoding:
35
48
  # MARC::Reader.decode( string )
@@ -44,17 +57,42 @@ module MARC
44
57
  # # explicitly tell MARC::Reader the encoding
45
58
  # MARC::Reader.new("myfile.marc", :external_encoding => "cp866")
46
59
  #
47
- # # If you have Marc8 data, you _really_ want to convert it
48
- # # to UTF8 outside of ruby, but if you can't:
49
- # MARC::Reader.new("marc8.marc" :external_encoding => "binary")
50
- # # But you probably _will_ have problems subsequently in your own
51
- # # own code using the MARC::Record.
60
+ # === MARC-8
61
+ #
62
+ # The legacy MARC-8 encoding needs to be handled differently, because
63
+ # there is no built-in support in ruby for MARC-8.
64
+ #
65
+ # You _can_ specify "MARC-8" as an external encoding. It will trigger
66
+ # trans-code to UTF-8 (NFC-normalized) in the internal ruby strings.
67
+ #
68
+ # MARC::Reader.new("marc8.mrc", :external_encoding => "MARC-8")
69
+ #
70
+ # For external_encoding "MARC-8", :validate_encoding is always true,
71
+ # there's no way to ignore bad bytes in MARC-8 when transcoding to
72
+ # unicode. However, just as with other encodings, the
73
+ # `:invalid => :replace` and `:replace => "string"`
74
+ # options can be used to replace bad bytes instead of raising.
75
+ #
76
+ # If you want your MARC-8 to be transcoded internally to something
77
+ # other than UTF-8, you can use the :internal_encoding option
78
+ # which works with any encoding in MARC::Reader.
79
+ #
80
+ # MARC::Reader.new("marc8.mrc",
81
+ # :external_encoding => "MARC-8",
82
+ # :internal_encoding => "UTF-16LE")
83
+ #
84
+ # If you want to read in MARC-8 without transcoding, leaving the
85
+ # internal Strings in MARC-8, the only way to do that is with
86
+ # ruby's 'binary' (aka "ASCII-8BIT") encoding, since ruby doesn't
87
+ # know from MARC-8. This will work:
88
+ #
89
+ # MARC::Reader.new("marc8.mrc", :external_encoding => "binary")
52
90
  #
53
- # One way or another, you have to tell MARC::Reader what the external
54
- # encoding is, if it's not the default for your system (usually UTF-8).
55
- # It won't guess from internal MARC leader etc.
91
+ # Please note that MARC::Reader does _not_ currently have any facilities
92
+ # for guessing encoding from MARC21 leader byte 9, that is ignored.
93
+ #
94
+ # === Complete Encoding Options
56
95
  #
57
- # == Additional Options
58
96
  # These options can all be used on MARC::Reader.new _or_ MARC::Reader.decode
59
97
  # to specify external encoding, ask for a transcode to a different
60
98
  # encoding on read, or validate or replace bad bytes in source.
@@ -83,7 +121,7 @@ module MARC
83
121
  # your own replacement string for invalid bytes. You may use the
84
122
  # empty string to simply eliminate invalid bytes.
85
123
  #
86
- # == Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
124
+ # === Warning on ruby File's own :internal_encoding, and unsafe transcoding from ruby
87
125
  #
88
126
  # Be careful with using an explicit File object with the File's own
89
127
  # :internal_encoding set -- it can cause ruby to transcode your data
@@ -109,11 +147,14 @@ module MARC
109
147
  # MARC::Reader.new( File.new("marc_in_cp866.mrc", "r:binary:binary"),
110
148
  # :external_encoding => "cp866",
111
149
  # :internal_encoding => "utf-8")
112
- # == jruby note
113
- # Note all of our char encoding tests currently pass on jruby in ruby 1.9
114
- # mode; if you are using binary MARC records in a non-UTF8 encoding, you may
115
- # have trouble in jruby. We believe it's a jruby bug.
150
+ #
151
+ # === jruby note
152
+ # In the past, jruby encoding-related bugs have caused problems with
153
+ # our encoding treatments. See for example:
116
154
  # https://jira.codehaus.org/browse/JRUBY-6637
155
+ #
156
+ # We recommend using the latest version of jruby, especially
157
+ # at least jruby 1.7.6.
117
158
  class Reader
118
159
  include Enumerable
119
160
 
@@ -284,31 +325,10 @@ module MARC
284
325
 
285
326
  # remove end of field
286
327
  field_data.delete!(END_OF_FIELD)
287
-
288
- if field_data.respond_to?(:force_encoding)
289
- if params[:external_encoding]
290
- field_data = field_data.force_encoding(params[:external_encoding])
291
- end
292
-
293
- # If we're transcoding anyway, pass our invalid/replace options
294
- # on to String#encode, which will take care of them -- or raise
295
- # with illegal bytes without :replace=>:invalid.
296
- #
297
- # If we're NOT transcoding, we need to use our own pure-ruby
298
- # implementation to do invalid byte replacements. OR to raise
299
- # a predicatable exception iff :validate_encoding, otherwise
300
- # for performance we won't check, and you may or may not
301
- # get an exception from inside ruby-marc, and it may change
302
- # in future implementations.
303
- if params[:internal_encoding]
304
- field_data = field_data.encode(params[:internal_encoding], params)
305
- elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
306
- field_data = MARC::Reader.validate_encoding(field_data, params)
307
- end
308
-
309
- end
328
+
310
329
  # add a control field or data field
311
330
  if MARC::ControlField.control_tag?(tag)
331
+ field_data = MARC::Reader.set_encoding( field_data , params)
312
332
  record.append(MARC::ControlField.new(tag,field_data))
313
333
  else
314
334
  field = MARC::DataField.new(tag)
@@ -321,12 +341,13 @@ module MARC
321
341
  next if subfields.length() < 2
322
342
 
323
343
  # get indicators
324
- indicators = subfields.shift()
344
+ indicators = MARC::Reader.set_encoding( subfields.shift(), params)
325
345
  field.indicator1 = indicators[0,1]
326
346
  field.indicator2 = indicators[1,1]
327
347
 
328
348
  # add each subfield to the field
329
349
  subfields.each() do |data|
350
+ data = MARC::Reader.set_encoding( data, params )
330
351
  subfield = MARC::Subfield.new(data[0,1],data[1..-1])
331
352
  field.append(subfield)
332
353
  end
@@ -337,57 +358,57 @@ module MARC
337
358
  end
338
359
 
339
360
  return record
340
- end
341
-
342
- # Pass in a string, will raise an Encoding::InvalidByteSequenceError
343
- # if it contains an invalid byte for it's encoding; otherwise
344
- # returns an equivalent string. Surprisingly not built into
345
- # ruby 1.9.3 (yet?). https://bugs.ruby-lang.org/issues/6321
346
- #
347
- # The InvalidByteSequenceError will NOT be filled out
348
- # with the usual error metadata, sorry.
361
+ end
362
+
363
+ # input passed in probably has 'binary' encoding.
364
+ # We'll set it to the proper encoding, and depending on settings, optionally
365
+ # * check for valid encoding
366
+ # * raise if not valid
367
+ # * or replace bad bytes with replacement chars if not valid
368
+ # * transcode from external_encoding to internal_encoding
349
369
  #
350
- # OR, like String#encode, pass in option `:invalid => :replace`
351
- # to replace invalid bytes with a replacement string in the
352
- # returned string. Pass in the
353
- # char you'd like with option `:replace`, or will, like String#encode
354
- # use the unicode replacement char if it thinks it's a unicode encoding,
355
- # else ascii '?'.
370
+ # Special case for encoding "MARC-8" -- will be transcoded to
371
+ # UTF-8 (then further transcoded to internal_encoding, if set).
372
+ # For "MARC-8", validate_encoding is always true, there's no way to
373
+ # ignore bad bytes.
356
374
  #
357
- # in any case, method will raise, or return a new string
358
- # that is #valid_encoding?
359
- def self.validate_encoding(str, options = {})
360
- return str unless str.respond_to?(:encoding)
361
-
362
- if str.valid_encoding?
363
- return str
364
- elsif options[:invalid] != :replace
365
- # If we're not replacing, just raise right away without going through
366
- # chars for performance.
367
- #
368
- # That does mean we're not able to say exactly what byte was bad though.
369
- # And the exception isn't filled out with all it's usual attributes,
370
- # which would be hard even we were going through all the chars/bytes.
371
- raise Encoding::InvalidByteSequenceError.new("invalid byte in string for source encoding #{str.encoding.name}")
372
- else
373
- # :replace => :invalid,
374
- # actually need to go through chars to replace bad ones
375
- return str.chars.collect do |c|
376
- if c.valid_encoding?
377
- c
375
+ # Params options:
376
+ #
377
+ # * external_encoding: what encoding the input is expected to be in
378
+ # * validate_encoding: if true, will raise if the input contains bytes that are invalid for its encoding
379
+ # * invalid: if set to :replace, will replace bad bytes with replacement
380
+ # chars instead of raising.
381
+ # * replace: Set replacement char for use with 'invalid', otherwise defaults
382
+ # to unicode replacement char, or question mark.
383
+ def self.set_encoding(str, params)
384
+ if str.respond_to?(:force_encoding)
385
+ if params[:external_encoding]
386
+ if params[:external_encoding] == "MARC-8"
387
+ transcode_params = [:invalid, :replace].each_with_object({}) { |k, hash| hash[k] = params[k] if params.has_key?(k) }
388
+ str = MARC::Marc8::ToUnicode.new.transcode(str, transcode_params)
378
389
  else
379
- options[:replace] || (
380
- # surely there's a better way to tell if
381
- # an encoding is a 'Unicode encoding form'
382
- # than this? What's wrong with you ruby 1.9?
383
- str.encoding.name.start_with?('UTF') ?
384
- "\uFFFD" :
385
- "?" )
390
+ str = str.force_encoding(params[:external_encoding])
386
391
  end
387
- end.join
388
- end
389
- end
390
-
392
+ end
393
+
394
+ # If we're transcoding anyway, pass our invalid/replace options
395
+ # on to String#encode, which will take care of them -- or raise
396
+ # with illegal bytes without :invalid=>:replace.
397
+ #
398
+ # If we're NOT transcoding, we need to use our own pure-ruby
399
+ # implementation to do invalid byte replacements. OR to raise
400
+ # a predicatable exception iff :validate_encoding, otherwise
401
+ # for performance we won't check, and you may or may not
402
+ # get an exception from inside ruby-marc, and it may change
403
+ # in future implementations.
404
+ if params[:internal_encoding]
405
+ str = str.encode(params[:internal_encoding], params)
406
+ elsif (params[:invalid] || params[:replace] || (params[:validate_encoding] == true))
407
+ str = EnsureValidEncoding.ensure_valid_encoding(str, params)
408
+ end
409
+ end
410
+ return str
411
+ end
391
412
  end
392
413
 
393
414