marc 1.0.3 → 1.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3c354c92026e4cf40c482ef72de120b7aaffac5cbc24e1c369a498bf4519a6ff
4
- data.tar.gz: b6daa3e964746945d35cae76eb290cbc0b3dc2e5d03eb9d383e5e5aba473c676
3
+ metadata.gz: 474c3ee37225584b3e5f189ff5f49507b82741da82aef0dd68a7e39180d25874
4
+ data.tar.gz: f0d272c5171827dcfa327ae8d079ee0ada52ba8da5b981378f7c2352d16a7a0e
5
5
  SHA512:
6
- metadata.gz: b49c9c88fb12854317d0f2fc14ce465f261a9ba2db3941f694e690f124063bd905522d9a21e862e777d84992e858839956bd289a275e731be60f638e2b327e89
7
- data.tar.gz: 22be396099b50aa7dea829b10485e9f4dc0d7d9232470d1db05b56f95815f591f3eebdfdc44af90c6f8f212ddd982814acd5356de8cd9c88e194d54c93f9ba99
6
+ metadata.gz: 04361e464361334b874b737e292e58acae879c3d086a68d5292abb5b8ccd9c28370173c3a60ca6d5700ef8dbd941b607a888ef9c7c700623d4440f577423217d
7
+ data.tar.gz: 22162879382120991f8a76484c2de73ac5f2aa8daf163bb1de0977723102876e0c2f2a81b9e04d2f4c5beda189e4828edc4673c37bf8f02e3a4a1235a2abaf16
@@ -12,12 +12,12 @@ module MARC
12
12
  # http://www.loc.gov/marc/specifications/speccharmarc8.html
13
13
  #
14
14
  # NOT thread-safe, it needs to keep state as it goes through a string,
15
- # do not re-use between threads.
15
+ # do not re-use between threads.
16
16
  #
17
- # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
17
+ # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
18
18
  #
19
19
  # Returns UTF-8 encoded string! Encode to something else if you want
20
- # something else.
20
+ # something else.
21
21
  #
22
22
  # III proprietary code points?
23
23
  class ToUnicode
@@ -31,7 +31,7 @@ module MARC
31
31
 
32
32
  # These are state flags, MARC8 requires you to keep
33
33
  # track of 'current char sets' or something like that, which
34
- # are changed with escape codes, or something like that.
34
+ # are changed with escape codes, or something like that.
35
35
  attr_accessor :g0, :g1
36
36
 
37
37
  def initialize
@@ -39,21 +39,21 @@ module MARC
39
39
  self.g1 = ANSEL
40
40
  end
41
41
 
42
- # Returns UTF-8 encoded string equivalent of marc8_string passed in.
42
+ # Returns UTF-8 encoded string equivalent of marc8_string passed in.
43
43
  #
44
44
  # Bad Marc8 bytes? By default will raise an Encoding::InvalidByteSequenceError
45
45
  # (will not have full metadata filled out, but will have a decent error message)
46
46
  #
47
47
  # Set option :invalid => :replace to instead silently replace bad bytes
48
- # with a replacement char -- by default Unicode Replacement Char, but can set
49
- # option :replace to something else, including empty string.
48
+ # with a replacement char -- by default Unicode Replacement Char, but can set
49
+ # option :replace to something else, including empty string.
50
50
  #
51
51
  # converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
52
52
  #
53
53
  # By default returns NFC normalized, but set :normalization option to:
54
54
  # :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
55
55
  # we won't do any normalization just take it as it comes out of the
56
- # transcode algorithm. This will generally NOT be composed.
56
+ # transcode algorithm. This will generally NOT be composed.
57
57
  #
58
58
  # By default, escaped unicode 'named character references' in Marc8 will
59
59
  # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
@@ -61,21 +61,21 @@ module MARC
61
61
  #
62
62
  # String arg passed in WILL have its encoding tagged 'binary' if
63
63
  # it's not already, if it's Marc8 there's no good reason for it not to
64
- # be already.
64
+ # be already.
65
65
  def transcode(marc8_string, options = {})
66
66
  invalid_replacement = options.fetch(:replace, "\uFFFD")
67
67
  expand_ncr = options.fetch(:expand_ncr, true)
68
68
  normalization = options.fetch(:normalization, :nfc)
69
69
 
70
-
70
+
71
71
  # don't choke on empty marc8_string
72
72
  return "" if marc8_string.nil? || marc8_string.empty?
73
-
73
+
74
74
  # Make sure to call it 'binary', so we can slice it
75
75
  # byte by byte, and so ruby doesn't complain about bad
76
76
  # bytes for some other encoding. Yeah, we're changing
77
77
  # encoding on input! If it's Marc8, it ought to be tagged
78
- # binary already.
78
+ # binary already.
79
79
  marc8_string.force_encoding("binary")
80
80
 
81
81
  uni_list = []
@@ -124,7 +124,7 @@ module MARC
124
124
  end
125
125
 
126
126
  mb_flag = is_multibyte(self.g0)
127
-
127
+
128
128
  if mb_flag
129
129
  code_point = (marc8_string[pos].ord * 65536 +
130
130
  marc8_string[pos+1].ord * 256 +
@@ -134,7 +134,7 @@ module MARC
134
134
  code_point = marc8_string[pos].ord
135
135
  pos += 1
136
136
  end
137
-
137
+
138
138
  if (code_point < 0x20 or
139
139
  (code_point > 0x80 and code_point < 0xa0))
140
140
  uni = unichr(code_point)
@@ -144,7 +144,7 @@ module MARC
144
144
  begin
145
145
  code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
146
146
  (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
147
-
147
+
148
148
  if cflag
149
149
  combinings.push unichr(uni)
150
150
  else
@@ -160,16 +160,16 @@ module MARC
160
160
  uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
161
161
  pos += 1
162
162
  else
163
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
163
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
164
164
  end
165
165
  end
166
166
  end
167
167
 
168
168
  # what to do if combining chars left over?
169
169
  uni_str = uni_list.join('')
170
-
170
+
171
171
  if expand_ncr
172
- uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
172
+ uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
173
173
  [$1.hex].pack("U")
174
174
  end
175
175
  end
@@ -177,7 +177,7 @@ module MARC
177
177
  if normalization
178
178
  uni_str = UNF::Normalizer.normalize(uni_str, normalization)
179
179
  end
180
-
180
+
181
181
  return uni_str
182
182
  end
183
183
 
@@ -188,11 +188,11 @@ module MARC
188
188
  end
189
189
 
190
190
  # input single unicode codepoint as integer; output encoded as a UTF-8 string
191
- # python has unichr built-in, we just define it for convenience no problem.
191
+ # python has unichr built-in, we just define it for convenience no problem.
192
192
  def unichr(code_point)
193
193
  [code_point].pack("U")
194
194
  end
195
195
 
196
196
  end
197
197
  end
198
- end
198
+ end
@@ -1,3 +1,3 @@
1
1
  module MARC
2
- VERSION = "1.0.3"
2
+ VERSION = "1.0.4"
3
3
  end
@@ -32,9 +32,9 @@ if "".respond_to?(:encoding)
32
32
 
33
33
  def test_lots_of_marc8_test_cases
34
34
  # Heap of test cases taken from pymarc, which provided these
35
- # two data files, marc8 and utf8, with line-by-line correspondences.
35
+ # two data files, marc8 and utf8, with line-by-line correspondences.
36
36
  #
37
- # For now, we have NOT included proprietary III encodings in our test data!
37
+ # For now, we have NOT included proprietary III encodings in our test data!
38
38
  utf8_file = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
39
39
  marc8_file = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
40
40
 
@@ -55,7 +55,7 @@ if "".respond_to?(:encoding)
55
55
 
56
56
  assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
57
57
  end
58
- rescue EOFError => each
58
+ rescue EOFError => each
59
59
  # just means the file was over, no biggie
60
60
  assert i > 1500, "Read as many lines as we expected to, at least 1500"
61
61
  rescue Exception => e
@@ -82,27 +82,50 @@ if "".respond_to?(:encoding)
82
82
  assert_equal unicode_d, converter.transcode(marc8, :normalization => :nfd)
83
83
  assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
84
84
 
85
- # disable normalization for performance or something, we won't end up with NFC.
85
+ # disable normalization for performance or something, we won't end up with NFC.
86
86
  refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
87
87
  end
88
88
 
89
89
  def test_expand_ncr
90
90
  converter = MARC::Marc8::ToUnicode.new
91
-
91
+
92
92
  marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
93
93
  assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
94
94
  assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
95
- end
95
+ end
96
96
 
97
97
  def test_bad_byte
98
98
  converter = MARC::Marc8::ToUnicode.new
99
99
 
100
100
  bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
101
101
  assert_raise(Encoding::InvalidByteSequenceError) {
102
- value = converter.transcode(bad_marc8)
102
+ converter.transcode(bad_marc8)
103
103
  }
104
104
  end
105
105
 
106
+ def test_bad_byte_error_message
107
+ converter = MARC::Marc8::ToUnicode.new
108
+
109
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
110
+ begin
111
+ converter.transcode(bad_marc8)
112
+ rescue Encoding::InvalidByteSequenceError => err
113
+ assert_equal("MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�", err.message)
114
+ end
115
+ end
116
+
117
+ def test_multiple_bad_byte_error_message
118
+ converter = MARC::Marc8::ToUnicode.new
119
+
120
+ bad_marc8 = "\e$1!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
121
+ begin
122
+ converter.transcode(bad_marc8)
123
+ rescue Encoding::InvalidByteSequenceError => err
124
+ # It still identifies the first bad byte found in the offset info, but replaces all bad bytes in the error message
125
+ assert_equal("MARC8, input byte offset 21, code set: 0x31, code point: 0x7b3639, value: 統治の仕組� 米国の統治の仕組� 米国の統治の仕組�", err.message)
126
+ end
127
+ end
128
+
106
129
  def test_bad_byte_with_replacement
107
130
  converter = MARC::Marc8::ToUnicode.new
108
131
 
@@ -112,9 +135,9 @@ if "".respond_to?(:encoding)
112
135
  assert_equal "UTF-8", value.encoding.name
113
136
  assert value.valid_encoding?
114
137
 
115
- assert value.include?("\uFFFD"), "includes replacement char"
138
+ assert value.include?("\uFFFD"), "includes replacement char"
116
139
  # coalescing multiple replacement chars at end, could change
117
- # to not do so, important thing is at least one is there.
140
+ # to not do so, important thing is at least one is there.
118
141
  assert_equal "米国の統治の仕組�", value
119
142
  end
120
143
 
@@ -150,5 +173,5 @@ if "".respond_to?(:encoding)
150
173
  end
151
174
  else
152
175
  require 'pathname'
153
- $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
176
+ $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
154
177
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: marc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.3
4
+ version: 1.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin Clarke
@@ -13,7 +13,7 @@ authors:
13
13
  autorequire: marc
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2019-03-27 00:00:00.000000000 Z
16
+ date: 2019-06-28 00:00:00.000000000 Z
17
17
  dependencies:
18
18
  - !ruby/object:Gem::Dependency
19
19
  name: scrub_rb
@@ -131,8 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
131
131
  - !ruby/object:Gem::Version
132
132
  version: '0'
133
133
  requirements: []
134
- rubyforge_project:
135
- rubygems_version: 2.7.6
134
+ rubygems_version: 3.0.3
136
135
  signing_key:
137
136
  specification_version: 4
138
137
  summary: A ruby library for working with Machine Readable Cataloging