marc 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/marc/marc8/to_unicode.rb +21 -21
- data/lib/marc/version.rb +1 -1
- data/test/marc8/tc_to_unicode.rb +33 -10
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 474c3ee37225584b3e5f189ff5f49507b82741da82aef0dd68a7e39180d25874
|
4
|
+
data.tar.gz: f0d272c5171827dcfa327ae8d079ee0ada52ba8da5b981378f7c2352d16a7a0e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04361e464361334b874b737e292e58acae879c3d086a68d5292abb5b8ccd9c28370173c3a60ca6d5700ef8dbd941b607a888ef9c7c700623d4440f577423217d
|
7
|
+
data.tar.gz: 22162879382120991f8a76484c2de73ac5f2aa8daf163bb1de0977723102876e0c2f2a81b9e04d2f4c5beda189e4828edc4673c37bf8f02e3a4a1235a2abaf16
|
@@ -12,12 +12,12 @@ module MARC
|
|
12
12
|
# http://www.loc.gov/marc/specifications/speccharmarc8.html
|
13
13
|
#
|
14
14
|
# NOT thread-safe, it needs to keep state as it goes through a string,
|
15
|
-
# do not re-use between threads.
|
15
|
+
# do not re-use between threads.
|
16
16
|
#
|
17
|
-
# Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
|
17
|
+
# Uses 4 spaces per indent, rather than usual ruby 2 space, just to change the python less.
|
18
18
|
#
|
19
19
|
# Returns UTF-8 encoded string! Encode to something else if you want
|
20
|
-
# something else.
|
20
|
+
# something else.
|
21
21
|
#
|
22
22
|
# III proprietary code points?
|
23
23
|
class ToUnicode
|
@@ -31,7 +31,7 @@ module MARC
|
|
31
31
|
|
32
32
|
# These are state flags, MARC8 requires you to keep
|
33
33
|
# track of 'current char sets' or something like that, which
|
34
|
-
# are changed with escape codes, or something like that.
|
34
|
+
# are changed with escape codes, or something like that.
|
35
35
|
attr_accessor :g0, :g1
|
36
36
|
|
37
37
|
def initialize
|
@@ -39,21 +39,21 @@ module MARC
|
|
39
39
|
self.g1 = ANSEL
|
40
40
|
end
|
41
41
|
|
42
|
-
# Returns UTF-8 encoded string equivalent of marc8_string passed in.
|
42
|
+
# Returns UTF-8 encoded string equivalent of marc8_string passed in.
|
43
43
|
#
|
44
44
|
# Bad Marc8 bytes? By default will raise an Encoding::InvalidByteSequenceError
|
45
45
|
# (will not have full metadata filled out, but will have a decent error message)
|
46
46
|
#
|
47
47
|
# Set option :invalid => :replace to instead silently replace bad bytes
|
48
|
-
# with a replacement char -- by default Unicode Replacement Char, but can set
|
49
|
-
# option :replace to something else, including empty string.
|
48
|
+
# with a replacement char -- by default Unicode Replacement Char, but can set
|
49
|
+
# option :replace to something else, including empty string.
|
50
50
|
#
|
51
51
|
# converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
|
52
52
|
#
|
53
53
|
# By default returns NFC normalized, but set :normalization option to:
|
54
54
|
# :nfd, :nfkd, :nfkc, :nfc, or nil. Set to nil for higher performance,
|
55
55
|
# we won't do any normalization just take it as it comes out of the
|
56
|
-
# transcode algorithm. This will generally NOT be composed.
|
56
|
+
# transcode algorithm. This will generally NOT be composed.
|
57
57
|
#
|
58
58
|
# By default, escaped unicode 'named character references' in Marc8 will
|
59
59
|
# be translated to actual UTF8. Eg. "&#x200F;" But pass :expand_ncr => false
|
@@ -61,21 +61,21 @@ module MARC
|
|
61
61
|
#
|
62
62
|
# String arg passed in WILL have it's encoding tagged 'binary' if
|
63
63
|
# it's not already, if it's Marc8 there's no good reason for it not to
|
64
|
-
# be already.
|
64
|
+
# be already.
|
65
65
|
def transcode(marc8_string, options = {})
|
66
66
|
invalid_replacement = options.fetch(:replace, "\uFFFD")
|
67
67
|
expand_ncr = options.fetch(:expand_ncr, true)
|
68
68
|
normalization = options.fetch(:normalization, :nfc)
|
69
69
|
|
70
|
-
|
70
|
+
|
71
71
|
# don't choke on empty marc8_string
|
72
72
|
return "" if marc8_string.nil? || marc8_string.empty?
|
73
|
-
|
73
|
+
|
74
74
|
# Make sure to call it 'binary', so we can slice it
|
75
75
|
# byte by byte, and so ruby doesn't complain about bad
|
76
76
|
# bytes for some other encoding. Yeah, we're changing
|
77
77
|
# encoding on input! If it's Marc8, it ought to be tagged
|
78
|
-
# binary already.
|
78
|
+
# binary already.
|
79
79
|
marc8_string.force_encoding("binary")
|
80
80
|
|
81
81
|
uni_list = []
|
@@ -124,7 +124,7 @@ module MARC
|
|
124
124
|
end
|
125
125
|
|
126
126
|
mb_flag = is_multibyte(self.g0)
|
127
|
-
|
127
|
+
|
128
128
|
if mb_flag
|
129
129
|
code_point = (marc8_string[pos].ord * 65536 +
|
130
130
|
marc8_string[pos+1].ord * 256 +
|
@@ -134,7 +134,7 @@ module MARC
|
|
134
134
|
code_point = marc8_string[pos].ord
|
135
135
|
pos += 1
|
136
136
|
end
|
137
|
-
|
137
|
+
|
138
138
|
if (code_point < 0x20 or
|
139
139
|
(code_point > 0x80 and code_point < 0xa0))
|
140
140
|
uni = unichr(code_point)
|
@@ -144,7 +144,7 @@ module MARC
|
|
144
144
|
begin
|
145
145
|
code_set = (code_point > 0x80 and not mb_flag) ? self.g1 : self.g0
|
146
146
|
(uni, cflag) = CODESETS.fetch(code_set).fetch(code_point)
|
147
|
-
|
147
|
+
|
148
148
|
if cflag
|
149
149
|
combinings.push unichr(uni)
|
150
150
|
else
|
@@ -160,16 +160,16 @@ module MARC
|
|
160
160
|
uni_list.push invalid_replacement unless uni_list.last == invalid_replacement
|
161
161
|
pos += 1
|
162
162
|
else
|
163
|
-
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}")
|
163
|
+
raise Encoding::InvalidByteSequenceError.new("MARC8, input byte offset #{pos}, code set: 0x#{code_set.to_s(16)}, code point: 0x#{code_point.to_s(16)}, value: #{transcode(marc8_string, :invalid => :replace, :replace => "�")}")
|
164
164
|
end
|
165
165
|
end
|
166
166
|
end
|
167
167
|
|
168
168
|
# what to do if combining chars left over?
|
169
169
|
uni_str = uni_list.join('')
|
170
|
-
|
170
|
+
|
171
171
|
if expand_ncr
|
172
|
-
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
172
|
+
uni_str.gsub!(/&#x([0-9A-F]{4,6});/) do
|
173
173
|
[$1.hex].pack("U")
|
174
174
|
end
|
175
175
|
end
|
@@ -177,7 +177,7 @@ module MARC
|
|
177
177
|
if normalization
|
178
178
|
uni_str = UNF::Normalizer.normalize(uni_str, normalization)
|
179
179
|
end
|
180
|
-
|
180
|
+
|
181
181
|
return uni_str
|
182
182
|
end
|
183
183
|
|
@@ -188,11 +188,11 @@ module MARC
|
|
188
188
|
end
|
189
189
|
|
190
190
|
# input single unicode codepoint as integer; output encoded as a UTF-8 string
|
191
|
-
# python has unichr built-in, we just define it for convenience no problem.
|
191
|
+
# python has unichr built-in, we just define it for convenience no problem.
|
192
192
|
def unichr(code_point)
|
193
193
|
[code_point].pack("U")
|
194
194
|
end
|
195
195
|
|
196
196
|
end
|
197
197
|
end
|
198
|
-
end
|
198
|
+
end
|
data/lib/marc/version.rb
CHANGED
data/test/marc8/tc_to_unicode.rb
CHANGED
@@ -32,9 +32,9 @@ if "".respond_to?(:encoding)
|
|
32
32
|
|
33
33
|
def test_lots_of_marc8_test_cases
|
34
34
|
# Heap of test cases taken from pymarc, which provided these
|
35
|
-
# two data files, marc8 and utf8, with line-by-line correspondences.
|
35
|
+
# two data files, marc8 and utf8, with line-by-line correspondences.
|
36
36
|
#
|
37
|
-
# For now, we have NOT included proprietary III encodings in our test data!
|
37
|
+
# For now, we have NOT included proprietary III encodings in our test data!
|
38
38
|
utf8_file = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
|
39
39
|
marc8_file = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
|
40
40
|
|
@@ -55,7 +55,7 @@ if "".respond_to?(:encoding)
|
|
55
55
|
|
56
56
|
assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
|
57
57
|
end
|
58
|
-
rescue EOFError => each
|
58
|
+
rescue EOFError => each
|
59
59
|
# just means the file was over, no biggie
|
60
60
|
assert i > 1500, "Read as many lines as we expected to, at least 1500"
|
61
61
|
rescue Exception => e
|
@@ -82,27 +82,50 @@ if "".respond_to?(:encoding)
|
|
82
82
|
assert_equal unicode_d, converter.transcode(marc8, :normalization => :nfd)
|
83
83
|
assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
|
84
84
|
|
85
|
-
# disable normalization for performance or something, we won't end up with NFC.
|
85
|
+
# disable normalization for performance or something, we won't end up with NFC.
|
86
86
|
refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
|
87
87
|
end
|
88
88
|
|
89
89
|
def test_expand_ncr
|
90
90
|
converter = MARC::Marc8::ToUnicode.new
|
91
|
-
|
91
|
+
|
92
92
|
marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
|
93
93
|
assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
|
94
94
|
assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
|
95
|
-
end
|
95
|
+
end
|
96
96
|
|
97
97
|
def test_bad_byte
|
98
98
|
converter = MARC::Marc8::ToUnicode.new
|
99
99
|
|
100
100
|
bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
|
101
101
|
assert_raise(Encoding::InvalidByteSequenceError) {
|
102
|
-
|
102
|
+
converter.transcode(bad_marc8)
|
103
103
|
}
|
104
104
|
end
|
105
105
|
|
106
|
+
def test_bad_byte_error_message
|
107
|
+
converter = MARC::Marc8::ToUnicode.new
|
108
|
+
|
109
|
+
bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
|
110
|
+
begin
|
111
|
+
converter.transcode(bad_marc8)
|
112
|
+
rescue Encoding::InvalidByteSequenceError => err
|
113
|
+
assert_equal("MARC8, input byte offset 30, code set: 0x31, code point: 0x7b3639, value: 米国の統治の仕組�", err.message)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_multiple_bad_byte_error_message
|
118
|
+
converter = MARC::Marc8::ToUnicode.new
|
119
|
+
|
120
|
+
bad_marc8 = "\e$1!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B \e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
|
121
|
+
begin
|
122
|
+
converter.transcode(bad_marc8)
|
123
|
+
rescue Encoding::InvalidByteSequenceError => err
|
124
|
+
# It still identifies the first bad byte found in the offset info, but replaces all bad bytes in the error message
|
125
|
+
assert_equal("MARC8, input byte offset 21, code set: 0x31, code point: 0x7b3639, value: 統治の仕組� 米国の統治の仕組� 米国の統治の仕組�", err.message)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
106
129
|
def test_bad_byte_with_replacement
|
107
130
|
converter = MARC::Marc8::ToUnicode.new
|
108
131
|
|
@@ -112,9 +135,9 @@ if "".respond_to?(:encoding)
|
|
112
135
|
assert_equal "UTF-8", value.encoding.name
|
113
136
|
assert value.valid_encoding?
|
114
137
|
|
115
|
-
assert value.include?("\uFFFD"), "includes replacement char"
|
138
|
+
assert value.include?("\uFFFD"), "includes replacement char"
|
116
139
|
# coalescing multiple replacement chars at end, could change
|
117
|
-
# to not do so, important thing is at least one is there.
|
140
|
+
# to not do so, important thing is at least one is there.
|
118
141
|
assert_equal "米国の統治の仕組�", value
|
119
142
|
end
|
120
143
|
|
@@ -150,5 +173,5 @@ if "".respond_to?(:encoding)
|
|
150
173
|
end
|
151
174
|
else
|
152
175
|
require 'pathname'
|
153
|
-
$stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
|
176
|
+
$stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
|
154
177
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: marc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3
|
4
|
+
version: 1.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin Clarke
|
@@ -13,7 +13,7 @@ authors:
|
|
13
13
|
autorequire: marc
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date: 2019-
|
16
|
+
date: 2019-06-28 00:00:00.000000000 Z
|
17
17
|
dependencies:
|
18
18
|
- !ruby/object:Gem::Dependency
|
19
19
|
name: scrub_rb
|
@@ -131,8 +131,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
131
131
|
- !ruby/object:Gem::Version
|
132
132
|
version: '0'
|
133
133
|
requirements: []
|
134
|
-
|
135
|
-
rubygems_version: 2.7.6
|
134
|
+
rubygems_version: 3.0.3
|
136
135
|
signing_key:
|
137
136
|
specification_version: 4
|
138
137
|
summary: A ruby library for working with Machine Readable Cataloging
|