marc 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 3c354c92026e4cf40c482ef72de120b7aaffac5cbc24e1c369a498bf4519a6ff
- data.tar.gz: b6daa3e964746945d35cae76eb290cbc0b3dc2e5d03eb9d383e5e5aba473c676
+ metadata.gz: 474c3ee37225584b3e5f189ff5f49507b82741da82aef0dd68a7e39180d25874
+ data.tar.gz: f0d272c5171827dcfa327ae8d079ee0ada52ba8da5b981378f7c2352d16a7a0e
  SHA512:
- metadata.gz: b49c9c88fb12854317d0f2fc14ce465f261a9ba2db3941f694e690f124063bd905522d9a21e862e777d84992e858839956bd289a275e731be60f638e2b327e89
- data.tar.gz: 22be396099b50aa7dea829b10485e9f4dc0d7d9232470d1db05b56f95815f591f3eebdfdc44af90c6f8f212ddd982814acd5356de8cd9c88e194d54c93f9ba99
+ metadata.gz: 04361e464361334b874b737e292e58acae879c3d086a68d5292abb5b8ccd9c28370173c3a60ca6d5700ef8dbd941b607a888ef9c7c700623d4440f577423217d
+ data.tar.gz: 22162879382120991f8a76484c2de73ac5f2aa8daf163bb1de0977723102876e0c2f2a81b9e04d2f4c5beda189e4828edc4673c37bf8f02e3a4a1235a2abaf16
@@ -12,12 +12,12 @@ module MARC
  # http://www.loc.gov/marc/specifications/speccharmarc8.html
  #
  # NOT thread-safe, it needs to keep state as it goes through a string,
- # do not re-use between threads.
+ # do not re-use between threads.
  #
- # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
+ # Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
  #
  # Returns UTF-8 encoded string! Encode to something else if you want
- # something else.
+ # something else.
  #
  # III proprietary code points?
  class ToUnicode
@@ -31,7 +31,7 @@ module MARC

  # These are state flags, MARC8 requires you to keep
  # track of 'current char sets' or something like that, which
- # are changed with escape codes, or something like that.
+ # are changed with escape codes, or something like that.
  attr_accessor :g0, :g1

  def initialize
@@ -39,21 +39,21 @@ module MARC
  self.g1 = ANSEL
  end

- # Returns UTF-8 encoded string equivalent of marc8_string passed in.
+ # Returns UTF-8 encoded string equivalent of marc8_string passed in.
  #
  # Bad Marc8 bytes? By default will raise an Encoding::InvalidByteSequenceError
  # (will not have full metadata filled out, but will have a decent error message)
  #
  # Set option :invalid => :replace to instead silently replace bad bytes
- # with a replacement char -- by default Unicode Replacement Char, but can set
- # option :replace to something else, including empty string.
+ # with a replacement char -- by default Unicode Replacement Char, but can set
+ # option :replace to something else, including empty string.
  #
  # converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
  #
  # By default returns NFC normalized, but set :normalization option to:
  # :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
  # we won't do any normalization just take it as it comes out of the
- # transcode algorithm. This will generally NOT be composed.
+ # transcode algorithm. This will generally NOT be composed.
  #
  # By default, escaped unicode 'named character references' in Marc8 will
  # be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
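For readers of the hunk above, here is a minimal usage sketch of the transcode options documented in those comments. The input string and the `require 'marc'` load path are assumptions for illustration, not part of this diff:

```ruby
require 'marc'

converter = MARC::Marc8::ToUnicode.new
marc8_bytes = "example bytes".dup.force_encoding("binary")  # hypothetical MARC-8 input

# Default: returns NFC-normalized UTF-8, raising Encoding::InvalidByteSequenceError on bad bytes.
utf8 = converter.transcode(marc8_bytes)

# Replace bad bytes instead of raising; :replace defaults to "\uFFFD" but can be anything, even "".
lenient = converter.transcode(marc8_bytes, :invalid => :replace, :replace => "")

# Skip Unicode normalization for speed, and leave &#xXXXX; named character references untouched.
raw = converter.transcode(marc8_bytes, :normalization => nil, :expand_ncr => false)
```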
@@ -61,21 +61,21 @@ module MARC
  #
  # String arg passed in WILL have it's encoding tagged 'binary' if
  # it's not already, if it's Marc8 there's no good reason for it not to
- # be already.
+ # be already.
  def transcode(marc8_string, options = {})
  invalid_replacement = options.fetch(:replace, "\uFFFD")
  expand_ncr = options.fetch(:expand_ncr, true)
  normalization = options.fetch(:normalization, :nfc)

-
+
  # don't choke on empty marc8_string
  return "" if marc8_string.nil? || marc8_string.empty?
-
+
  # Make sure to call it 'binary', so we can slice it
  # byte by byte, and so ruby doesn't complain about bad
  # bytes for some other encoding. Yeah, we're changing
  # encoding on input! If it's Marc8, it ought to be tagged
- # binary already.
+ # binary already.
  marc8_string.force_encoding("binary")

  uni_list = []
@@ -124,7 +124,7 @@ module MARC
  end

  mb_flag = is_multibyte(self.g0)
-
+
  if mb_flag
  code_point = (marc8_string[pos].ord * 65536 +
  marc8_string[pos+1].ord * 256 +
@@ -134,7 +134,7 @@ module MARC
  code_point = marc8_string[pos].ord
  pos += 1
  end
-
+
  if (code_point < 0x20 or
  (code_point > 0x80 and code_point < 0xa0))
  uni = unichr(code_point)
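As context for the arithmetic visible in the two hunks above: when the current G0 set is multibyte (EACC), three consecutive bytes are packed into one integer code point (the line adding the third byte falls outside the hunk, but presumably it is added directly, with no multiplier); otherwise the single byte's value is the code point. A small illustrative sketch with made-up byte values:

```ruby
# Illustrative only: the byte values below are not real MARC-8/EACC data.
bytes = [0x21, 0x50, 0x56]

# Multibyte branch: byte1 * 65536 + byte2 * 256 + byte3 (big-endian base-256 packing).
multibyte_code_point = bytes[0] * 65536 + bytes[1] * 256 + bytes[2]
puts format("0x%06x", multibyte_code_point)  # => 0x215056

# Single-byte branch: the byte value itself is the code point.
puts format("0x%02x", bytes[0])              # => 0x21
```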
@@ -144,7 +144,7 @@ module MARC
  begin
  code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
  (uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
-
+
  if cflag
  combinings.push unichr(uni)
  else
@@ -160,16 +160,16 @@ module MARC
  uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
  pos += 1
  else
- raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
+ raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
  end
  end
  end

  # what to do if combining chars left over?
  uni_str = uni_list.join('')
-
+
  if expand_ncr
- uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
+ uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
  [$1.hex].pack("U")
  end
  end
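The change to the raise above is the functional heart of this release: when a bad byte is hit without :invalid => :replace, the InvalidByteSequenceError message now also carries a "value:" field, produced by re-transcoding the whole input with bad bytes replaced. A sketch of how a caller might surface that; the bad_marc8 fixture is the one used by the new tests further down, while the require and rescue-and-log handling are assumptions for illustration:

```ruby
require 'marc'

converter = MARC::Marc8::ToUnicode.new
bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"

begin
  converter.transcode(bad_marc8)
rescue Encoding::InvalidByteSequenceError => err
  # Message now reads roughly:
  # "MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�"
  warn err.message
end
```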
@@ -177,7 +177,7 @@ module MARC
  if normalization
  uni_str = UNF::Normalizer.normalize(uni_str, normalization)
  end
-
+
  return uni_str
  end

@@ -188,11 +188,11 @@ module MARC
  end

  # input single unicode codepoint as integer; output encoded as a UTF-8 string
- # python has unichr built-in, we just define it for convenience no problem.
+ # python has unichr built-in, we just define it for convenience no problem.
  def unichr(code_point)
  [code_point].pack("U")
  end

  end
  end
- end
+ end
@@ -1,3 +1,3 @@
  module MARC
- VERSION = "1.0.3"
+ VERSION = "1.0.4"
  end
@@ -32,9 +32,9 @@ if "".respond_to?(:encoding)

  def test_lots_of_marc8_test_cases
  # Heap of test cases taken from pymarc, which provided these
- # two data files, marc8 and utf8, with line-by-line correspondences.
+ # two data files, marc8 and utf8, with line-by-line correspondences.
  #
- # For now, we have NOT included proprietary III encodings in our test data!
+ # For now, we have NOT included proprietary III encodings in our test data!
  utf8_file = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
  marc8_file = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")

@@ -55,7 +55,7 @@ if "".respond_to?(:encoding)

  assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
  end
- rescue EOFError => each
+ rescue EOFError => each
  # just means the file was over, no biggie
  assert i > 1500, "Read as many lines as we expected to, at least 1500"
  rescue Exception => e
@@ -82,27 +82,50 @@ if "".respond_to?(:encoding)
  assert_equal unicode_d, converter.transcode(marc8, :normalization => :nfd)
  assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)

- # disable normalization for performance or something, we won't end up with NFC.
+ # disable normalization for performance or something, we won't end up with NFC.
  refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
  end

  def test_expand_ncr
  converter = MARC::Marc8::ToUnicode.new
-
+
  marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
  assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
  assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
- end
+ end

  def test_bad_byte
  converter = MARC::Marc8::ToUnicode.new

  bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
  assert_raise(Encoding::InvalidByteSequenceError) {
- value = converter.transcode(bad_marc8)
+ converter.transcode(bad_marc8)
  }
  end

+ def test_bad_byte_error_message
+ converter = MARC::Marc8::ToUnicode.new
+
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+ begin
+ converter.transcode(bad_marc8)
+ rescue Encoding::InvalidByteSequenceError => err
+ assert_equal("MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�", err.message)
+ end
+ end
+
+ def test_multiple_bad_byte_error_message
+ converter = MARC::Marc8::ToUnicode.new
+
+ bad_marc8 = "\e$1!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
+ begin
+ converter.transcode(bad_marc8)
+ rescue Encoding::InvalidByteSequenceError => err
+ # It still identifies the first bad byte found in the offset info, but replaces all bad bytes in the error message
+ assert_equal("MARC8, input byte offset 21, code set: 0x31, code point: 0x7b3639, value: 統治の仕組� 米国の統治の仕組� 米国の統治の仕組�", err.message)
+ end
+ end
+
  def test_bad_byte_with_replacement
  converter = MARC::Marc8::ToUnicode.new

@@ -112,9 +135,9 @@ if "".respond_to?(:encoding)
  assert_equal "UTF-8", value.encoding.name
  assert value.valid_encoding?

- assert value.include?("\uFFFD"), "includes replacement char"
+ assert value.include?("\uFFFD"), "includes replacement char"
  # coalescing multiple replacement chars at end, could change
- # to not do so, important thing is at least one is there.
+ # to not do so, important thing is at least one is there.
  assert_equal "米国の統治の仕組�", value
  end

@@ -150,5 +173,5 @@ if "".respond_to?(:encoding)
  end
  else
  require 'pathname'
- $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
+ $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: marc
  version: !ruby/object:Gem::Version
- version: 1.0.3
+ version: 1.0.4
  platform: ruby
  authors:
  - Kevin Clarke
@@ -13,7 +13,7 @@ authors:
  autorequire: marc
  bindir: bin
  cert_chain: []
- date: 2019-03-27 00:00:00.000000000 Z
+ date: 2019-06-28 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: scrub_rb
@@ -131,8 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6
+ rubygems_version: 3.0.3
  signing_key:
  specification_version: 4
  summary: A ruby library for working with Machine Readable Cataloging