marc 0.7.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,11 @@
1
+ require 'test/unit'
2
+ require 'marc'
3
+ require 'marc/marc8/map_to_unicode'
4
+
5
+ class TestMarc8Mapping < Test::Unit::TestCase
6
+ def test_codesets_just_exist
7
+ assert MARC::Marc8::MapToUnicode::CODESETS
8
+ assert MARC::Marc8::MapToUnicode::CODESETS[0x34]
9
+ assert MARC::Marc8::MapToUnicode::CODESETS[0x34][0xa1]
10
+ end
11
+ end
@@ -0,0 +1,154 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'test/unit'
4
+ require 'marc'
5
+
6
+ require 'marc/marc8/to_unicode'
7
+
8
+ require 'unf'
9
+
10
+ if "".respond_to?(:encoding)
11
+
12
+
13
+ class TestMarc8ToUnicode < Test::Unit::TestCase
14
+ def test_empty_string
15
+ value = MARC::Marc8::ToUnicode.new.transcode("")
16
+ assert_equal "UTF-8", value.encoding.name
17
+ assert_equal "", value
18
+
19
+ value = MARC::Marc8::ToUnicode.new.transcode(nil)
20
+ assert_equal "UTF-8", value.encoding.name
21
+ assert_equal "", value
22
+ end
23
+
24
+ def test_one_example_marc8
25
+ value = MARC::Marc8::ToUnicode.new.transcode("Conversa\xF0c\xE4ao")
26
+ assert_equal "UTF-8", value.encoding.name
27
+
28
+ expected = UNF::Normalizer.normalize("Conversação", :nfc)
29
+
30
+ assert_equal expected, value
31
+ end
32
+
33
+ def test_lots_of_marc8_test_cases
34
+ # Heap of test cases taken from pymarc, which provided these
35
+ # two data files, marc8 and utf8, with line-by-line correspondences.
36
+ #
37
+ # For now, we have NOT included proprietary III encodings in our test data!
38
+ utf8_file = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
39
+ marc8_file = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
40
+
41
+ i = 0
42
+ converter = MARC::Marc8::ToUnicode.new
43
+
44
+ begin
45
+ while true do
46
+ i += 1
47
+
48
+ utf8 = utf8_file.readline.chomp
49
+ marc8 = marc8_file.readline.chomp
50
+
51
+ converted = converter.transcode(marc8)
52
+
53
+ assert_equal "UTF-8", converted.encoding.name, "Converted data line #{i} is tagged UTF-8"
54
+ assert converted.valid_encoding?, "Converted data line #{i} is valid_encoding"
55
+
56
+ assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
57
+ end
58
+ rescue EOFError => each
59
+ # just means the file was over, no biggie
60
+ assert i > 1500, "Read as many lines as we expected to, at least 1500"
61
+ rescue Exception => e
62
+ $stderr.puts "Error at test data line #{i}"
63
+ raise e
64
+ end
65
+ end
66
+
67
+ def test_explicit_normalization
68
+ # \xC1 is Marc8 "script small letter l", which under unicode
69
+ # COMPAT normalization will turn into ordinary 'l'
70
+ marc8 = "Conversa\xF0c\xE4ao \xC1"
71
+ unicode = "Conversação \u2113"
72
+
73
+ unicode_c = UNF::Normalizer.normalize(unicode, :nfc)
74
+ unicode_kc = UNF::Normalizer.normalize(unicode, :nfkc)
75
+ unicode_d = UNF::Normalizer.normalize(unicode, :nfd)
76
+ unicode_kd = UNF::Normalizer.normalize(unicode, :nfkd)
77
+
78
+ converter = MARC::Marc8::ToUnicode.new
79
+
80
+ assert_equal unicode_c, converter.transcode(marc8, :normalization => :nfc)
81
+ assert_equal unicode_kc, converter.transcode(marc8, :normalization => :nfkc)
82
+ assert_equal unicode_d, converter.transcode(marc8, :normalization => :nfd)
83
+ assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
84
+
85
+ # disable normalization for performance or something, we won't end up with NFC.
86
+ refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
87
+ end
88
+
89
+ def test_expand_ncr
90
+ converter = MARC::Marc8::ToUnicode.new
91
+
92
+ marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
93
+ assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
94
+ assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
95
+ end
96
+
97
+ def test_bad_byte
98
+ converter = MARC::Marc8::ToUnicode.new
99
+
100
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
101
+ assert_raise(Encoding::InvalidByteSequenceError) {
102
+ value = converter.transcode(bad_marc8)
103
+ }
104
+ end
105
+
106
+ def test_bad_byte_with_replacement
107
+ converter = MARC::Marc8::ToUnicode.new
108
+
109
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
110
+ value = converter.transcode(bad_marc8, :invalid => :replace)
111
+
112
+ assert_equal "UTF-8", value.encoding.name
113
+ assert value.valid_encoding?
114
+
115
+ assert value.include?("\uFFFD"), "includes replacement char"
116
+ # coalescing multiple replacement chars at end, could change
117
+ # to not do so, important thing is at least one is there.
118
+ assert_equal "米国の統治の仕組�", value
119
+ end
120
+
121
+ def test_bad_byte_with_specified_empty_replacement
122
+ converter = MARC::Marc8::ToUnicode.new
123
+
124
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
125
+ value = converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
126
+
127
+ assert_equal "UTF-8", value.encoding.name
128
+ assert value.valid_encoding?
129
+
130
+ assert_equal "米国の統治の仕組", value
131
+ end
132
+
133
+ def test_bad_escape
134
+ converter = MARC::Marc8::ToUnicode.new
135
+
136
+ # I do not understand what's going on here, or why this is
137
+ # desired/expected behavior. But this
138
+ # test is copied from pymarc , adapted to be straight data not marc record
139
+ # https://github.com/edsu/pymarc/blob/master/test/marc8.py?source=cc#L34
140
+
141
+ bad_escape_data = "La Soci\xE2et\e,"
142
+ value = converter.transcode(bad_escape_data)
143
+
144
+ assert_equal "UTF-8", value.encoding.name
145
+ assert value.valid_encoding?, "Valid encoding"
146
+
147
+ assert_equal "La Soci\u00E9t\x1B,", value
148
+ end
149
+
150
+ end
151
+ else
152
+ require 'pathname'
153
+ $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
154
+ end
@@ -0,0 +1,40 @@
1
+ 01161cam a2200289 4500
2
+ 001 178448
3
+ 008 s1996 xx spa d
4
+ 035 $a X!b
5
+ 049 $a JHWV [AV] [NIRC] $n o
6
+ 096 $a WY 20.5 VC6 1996
7
+ 110 2 $a National Institutes of Health (U.S.)
8
+ 110 2 $a National Institute of Nursing Research (U.S.)#
9
+ (No separator at end of field length=51)
10
+ (Bad indicator data. Skipping 2 bytes)
11
+ 110 2 $a Department of Health & Human Services (U.S.)
12
+ (Bad indicator data. Skipping 2 bytes)
13
+ 245 0 $a Ten years at NIH : $b advancing health through science : the human dimension / $c Patricia A. Grady, Harold Varmus.
14
+ (Bad indicator data. Skipping 2 bytes)
15
+ 246 $a 10 years at NIH
16
+ (Bad indicator data. Skipping 2 bytes)
17
+ 300 $a 2 videocassettes (229 min.) : $b sd., col. ; $c 1/2 in.
18
+ (Bad indicator data. Skipping 2 bytes)
19
+ 520 $a A series of speakers recounts advances in nursing research from 1986 to 1996. ˜
20
+ (Bad indicator data. Skipping 2 bytes)
21
+ 538 $a VHS.
22
+ (Bad indicator data. Skipping 2 bytes)
23
+ 650 2 $a Nursing Care $x videocassettes
24
+ (Bad indicator data. Skipping 2 bytes)
25
+ 650 2 $a Nursing Research $x videocassettes
26
+ (Bad indicator data. Skipping 2 bytes)
27
+ 650 2 $a Nursing $x videocassettes
28
+ (Bad indicator data. Skipping 2 bytes)
29
+ 700 10 $a Grady, Patricia Anne, $d 1943-
30
+ (Bad indicator data. Skipping 2 bytes)
31
+ 700 1 $a Varmus, Harold
32
+ (Bad indicator data. Skipping 2 bytes)
33
+ 910 $a 178448 $b Horizon bib#
34
+ (Bad indicator data. Skipping 2 bytes)
35
+ 949 31 $7 1 $5 WY 20.5 VC6 1996 $0 26 $0 G $2 A $8 5 $4 1
36
+ (Bad indicator data. Skipping 2 bytes)
37
+ 991 $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199765 $l wempbk $m elsc
38
+ (Bad indicator data. Skipping 2 bytes)
39
+ 991 $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199766 $l wempbk $m elsc
40
+
@@ -0,0 +1 @@
1
+ 01161cam a2200289 4500001000700000008004100007035000800048049002400056096002100080110004100101110005100142110004900193245011600242246002000358300005600378520008500434538000900519650003300528650003700561650002800598700003300626700001900659910002500678949004000703991006400743991006400807178448 s1996 xx spa d aX!b aJHWV [AV] [NIRC]no aWY 20.5 VC6 19962 aNational Institutes of Health (U.S.)2 aNational Institute of Nursing Research (U.S.)#7;2 aDepartment of Health & Human Services (U.S.) 0aTen years at NIH :badvancing health through science : the human dimension /cPatricia A. Grady, Harold Varmus. a10 years at NIH a2 videocassettes (229 min.) :bsd., col. ;c1/2 in. aA series of speakers recounts advances in nursing research from 1986 to 1996. ˜ aVHS. 2aNursing Carexvideocassettes 2aNursing Researchxvideocassettes 2aNursingxvideocassettes10aGrady, Patricia Anne,d1943-1 aVarmus, Harold a178448bHorizon bib#31715WY 20.5 VC6 19960260G2A8541 aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199765lwempbkmelsc aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199766lwempbkmelsc
@@ -44,7 +44,24 @@ if "".respond_to?(:encoding)
44
44
  assert_equal(encoding, record['001'].value.encoding.name)
45
45
  assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
46
46
  end
47
+
48
+ @@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
47
49
 
50
+
51
+ def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
52
+ record.fields.each do |field|
53
+ if field.kind_of? MARC::DataField
54
+ field.subfields.each do |sf|
55
+ assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
56
+ assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
57
+ end
58
+ else
59
+ assert_equal encoding_name, field.value.encoding.name, "Is tagged #{encoding_name}: #{field}"
60
+ assert field.value.valid_encoding?, "Is valid encoding: #{field}"
61
+ end
62
+ end
63
+ end
64
+
48
65
  ####
49
66
  # end helper methods
50
67
  ####
@@ -100,13 +117,57 @@ if "".respond_to?(:encoding)
100
117
  end
101
118
 
102
119
  def test_marc8_with_binary
103
- # Marc8, best we can do is read it in binary.
120
+ # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
104
121
  reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
105
122
  record = reader.first
106
123
 
107
124
  assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
108
125
  end
109
-
126
+
127
+ def test_marc8_converted_to_unicode
128
+ reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'MARC-8')
129
+ record = reader.first
130
+
131
+ assert_all_values_valid_encoding(record)
132
+
133
+ assert_equal "Serreau, Geneviève.", record['100']['a']
134
+ end
135
+
136
+ def test_marc8_converted_to_unicode_with_file_handle
137
+ # had some trouble with this one, let's ensure it with a test
138
+ file = File.new('test/marc8_accented_chars.marc')
139
+ reader = MARC::Reader.new(file, :external_encoding => "MARC-8")
140
+ record = reader.first
141
+
142
+ assert_all_values_valid_encoding(record)
143
+ end
144
+
145
+ def test_marc8_with_char_entity
146
+ reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :external_encoding => "MARC-8")
147
+ record = reader.first
148
+
149
+ assert_all_values_valid_encoding(record)
150
+
151
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
152
+ end
153
+
154
+ def test_bad_marc8_raises
155
+ assert_raise(Encoding::InvalidByteSequenceError) do
156
+ reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8')
157
+ record = reader.first
158
+ end
159
+ end
160
+
161
+ def test_bad_marc8_with_replacement
162
+ reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8', :invalid => :replace, :replace => "[?]")
163
+ record = reader.first
164
+
165
+ assert_all_values_valid_encoding(record)
166
+
167
+ assert record['880']['a'].include?("[?]"), "includes specified replacement string"
168
+ end
169
+
170
+
110
171
  def test_load_file_opened_with_external_encoding
111
172
  reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
112
173
 
@@ -125,12 +186,38 @@ if "".respond_to?(:encoding)
125
186
  end
126
187
 
127
188
  def test_from_string_with_utf8_encoding
128
- marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8")
189
+ marc_file = File.open(@@utf_marc_path)
129
190
 
130
- reader = MARC::Reader.new(StringIO.new(marc_string))
191
+ reader = MARC::Reader.new(marc_file)
131
192
  record = reader.first
132
193
 
133
- assert_utf8_right_in_utf8(record)
194
+
195
+
196
+
197
+ end
198
+
199
+ # Something that was failing in my client Blacklight code,
200
+ # bad bytes should be handled appropriately
201
+ def test_from_string_utf8_with_bad_byte
202
+ marc_file = File.open('test/marc_with_bad_utf8.utf8.marc')
203
+
204
+ reader = MARC::Reader.new(marc_file, :invalid => :replace)
205
+
206
+ record = reader.first
207
+
208
+ record.fields.each do |field|
209
+ if field.kind_of? MARC::ControlField
210
+ assert_equal "UTF-8", field.value.encoding.name
211
+ assert field.value.valid_encoding?
212
+ else
213
+ field.subfields.each do |subfield|
214
+ assert_equal "UTF-8", subfield.value.encoding.name
215
+ assert subfield.value.valid_encoding?, "value has valid encoding"
216
+ end
217
+ end
218
+ end
219
+
220
+ assert record['520']['a'].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
134
221
  end
135
222
 
136
223
  def test_from_string_with_cp866
metadata CHANGED
@@ -1,32 +1,66 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: marc
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.7.1
4
+ version: 0.8.0
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Kevin Clarke
9
9
  - Bill Dueber
10
10
  - William Groppe
11
+ - Jonathan Rochkind
11
12
  - Ross Singer
12
13
  - Ed Summers
13
14
  autorequire: marc
14
15
  bindir: bin
15
16
  cert_chain: []
16
- date: 2013-09-09 00:00:00.000000000 Z
17
- dependencies: []
18
- description:
17
+ date: 2013-11-20 00:00:00.000000000 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: ensure_valid_encoding
21
+ requirement: !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ type: :runtime
28
+ prerelease: false
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ - !ruby/object:Gem::Dependency
36
+ name: unf
37
+ requirement: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ description:
19
52
  email: ehs@pobox.com
20
53
  executables: []
21
54
  extensions: []
22
55
  extra_rdoc_files: []
23
56
  files:
24
- - lib/marc.rb
25
57
  - lib/marc/constants.rb
26
58
  - lib/marc/controlfield.rb
27
59
  - lib/marc/datafield.rb
28
60
  - lib/marc/dublincore.rb
29
61
  - lib/marc/exception.rb
62
+ - lib/marc/marc8/map_to_unicode.rb
63
+ - lib/marc/marc8/to_unicode.rb
30
64
  - lib/marc/reader.rb
31
65
  - lib/marc/record.rb
32
66
  - lib/marc/subfield.rb
@@ -36,12 +70,24 @@ files:
36
70
  - lib/marc/xml_parsers.rb
37
71
  - lib/marc/xmlreader.rb
38
72
  - lib/marc/xmlwriter.rb
73
+ - lib/marc.rb
74
+ - test/bad_eacc_encoding.marc8.marc
39
75
  - test/batch.dat
40
76
  - test/batch.xml
77
+ - test/bib178448.okay.human
78
+ - test/bib178448.okay.marc
79
+ - test/bib178448.writtenout.marc
41
80
  - test/cp866_multirecord.marc
42
81
  - test/cp866_unimarc.marc
82
+ - test/escaped_character_reference.marc8.marc
43
83
  - test/hebrew880s.marc
84
+ - test/marc8/data/test_marc8.txt
85
+ - test/marc8/data/test_utf8.txt
86
+ - test/marc8/tc_marc8_mapping.rb
87
+ - test/marc8/tc_to_unicode.rb
44
88
  - test/marc8_accented_chars.marc
89
+ - test/marc_with_bad_utf8.utf8.human
90
+ - test/marc_with_bad_utf8.utf8.marc
45
91
  - test/no-leading-zero.xml
46
92
  - test/non-numeric.dat
47
93
  - test/non-numeric.xml
@@ -59,7 +105,6 @@ files:
59
105
  - test/tc_reader_char_encodings.rb
60
106
  - test/tc_record.rb
61
107
  - test/tc_subfield.rb
62
- - test/tc_weird_jruby_bytes.rb
63
108
  - test/tc_writer.rb
64
109
  - test/tc_xml.rb
65
110
  - test/ts_marc.rb
@@ -73,27 +118,28 @@ files:
73
118
  homepage: https://github.com/ruby-marc/ruby-marc/
74
119
  licenses:
75
120
  - MIT
76
- post_install_message:
121
+ post_install_message:
77
122
  rdoc_options: []
78
123
  require_paths:
79
124
  - lib
80
125
  required_ruby_version: !ruby/object:Gem::Requirement
126
+ none: false
81
127
  requirements:
82
- - - '>='
128
+ - - ! '>='
83
129
  - !ruby/object:Gem::Version
84
130
  version: 1.8.6
85
- none: false
86
131
  required_rubygems_version: !ruby/object:Gem::Requirement
132
+ none: false
87
133
  requirements:
88
- - - '>='
134
+ - - ! '>='
89
135
  - !ruby/object:Gem::Version
90
136
  version: '0'
91
- none: false
92
137
  requirements: []
93
- rubyforge_project:
94
- rubygems_version: 1.8.24
95
- signing_key:
138
+ rubyforge_project:
139
+ rubygems_version: 1.8.23
140
+ signing_key:
96
141
  specification_version: 3
97
142
  summary: A ruby library for working with Machine Readable Cataloging
98
143
  test_files:
99
144
  - test/ts_marc.rb
145
+ has_rdoc: true