marc 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ require 'test/unit'
2
+ require 'marc'
3
+ require 'marc/marc8/map_to_unicode'
4
+
5
+ class TestMarc8Mapping < Test::Unit::TestCase
6
+ def test_codesets_just_exist
7
+ assert MARC::Marc8::MapToUnicode::CODESETS
8
+ assert MARC::Marc8::MapToUnicode::CODESETS[0x34]
9
+ assert MARC::Marc8::MapToUnicode::CODESETS[0x34][0xa1]
10
+ end
11
+ end
@@ -0,0 +1,154 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'test/unit'
4
+ require 'marc'
5
+
6
+ require 'marc/marc8/to_unicode'
7
+
8
+ require 'unf'
9
+
10
+ if "".respond_to?(:encoding)
11
+
12
+
13
+ class TestMarc8ToUnicode < Test::Unit::TestCase
14
+ def test_empty_string
15
+ value = MARC::Marc8::ToUnicode.new.transcode("")
16
+ assert_equal "UTF-8", value.encoding.name
17
+ assert_equal "", value
18
+
19
+ value = MARC::Marc8::ToUnicode.new.transcode(nil)
20
+ assert_equal "UTF-8", value.encoding.name
21
+ assert_equal "", value
22
+ end
23
+
24
+ def test_one_example_marc8
25
+ value = MARC::Marc8::ToUnicode.new.transcode("Conversa\xF0c\xE4ao")
26
+ assert_equal "UTF-8", value.encoding.name
27
+
28
+ expected = UNF::Normalizer.normalize("Conversação", :nfc)
29
+
30
+ assert_equal expected, value
31
+ end
32
+
33
+ def test_lots_of_marc8_test_cases
34
+ # Heap of test cases taken from pymarc, which provided these
35
+ # two data files, marc8 and utf8, with line-by-line correspondences.
36
+ #
37
+ # For now, we have NOT included proprietary III encodings in our test data!
38
+ utf8_file = File.open( File.expand_path("../data/test_utf8.txt", __FILE__), "r:UTF-8")
39
+ marc8_file = File.open( File.expand_path("../data/test_marc8.txt", __FILE__), "r:binary")
40
+
41
+ i = 0
42
+ converter = MARC::Marc8::ToUnicode.new
43
+
44
+ begin
45
+ while true do
46
+ i += 1
47
+
48
+ utf8 = utf8_file.readline.chomp
49
+ marc8 = marc8_file.readline.chomp
50
+
51
+ converted = converter.transcode(marc8)
52
+
53
+ assert_equal "UTF-8", converted.encoding.name, "Converted data line #{i} is tagged UTF-8"
54
+ assert converted.valid_encoding?, "Converted data line #{i} is valid_encoding"
55
+
56
+ assert_equal utf8, converted, "Test data line #{i}, expected converted to match provided utf8"
57
+ end
58
+ rescue EOFError => each
59
+ # just means the file was over, no biggie
60
+ assert i > 1500, "Read as many lines as we expected to, at least 1500"
61
+ rescue Exception => e
62
+ $stderr.puts "Error at test data line #{i}"
63
+ raise e
64
+ end
65
+ end
66
+
67
+ def test_explicit_normalization
68
+ # \xC1 is Marc8 "script small letter l", which under unicode
69
+ # COMPAT normalization will turn into ordinary 'l'
70
+ marc8 = "Conversa\xF0c\xE4ao \xC1"
71
+ unicode = "Conversação \u2113"
72
+
73
+ unicode_c = UNF::Normalizer.normalize(unicode, :nfc)
74
+ unicode_kc = UNF::Normalizer.normalize(unicode, :nfkc)
75
+ unicode_d = UNF::Normalizer.normalize(unicode, :nfd)
76
+ unicode_kd = UNF::Normalizer.normalize(unicode, :nfkd)
77
+
78
+ converter = MARC::Marc8::ToUnicode.new
79
+
80
+ assert_equal unicode_c, converter.transcode(marc8, :normalization => :nfc)
81
+ assert_equal unicode_kc, converter.transcode(marc8, :normalization => :nfkc)
82
+ assert_equal unicode_d, converter.transcode(marc8, :normalization => :nfd)
83
+ assert_equal unicode_kd, converter.transcode(marc8, :normalization => :nfkd)
84
+
85
+ # disable normalization for performance or something, we won't end up with NFC.
86
+ refute_equal unicode_c, converter.transcode(marc8, :normalization => nil)
87
+ end
88
+
89
+ def test_expand_ncr
90
+ converter = MARC::Marc8::ToUnicode.new
91
+
92
+ marc8_ncr = "Weird &#x200F; &#xFFFD; but these aren't changed #x2000; &#200F etc."
93
+ assert_equal "Weird \u200F \uFFFD but these aren't changed #x2000; &#200F etc.", converter.transcode(marc8_ncr)
94
+ assert_equal marc8_ncr, converter.transcode(marc8_ncr, :expand_ncr => false), "should not expand NCR if disabled"
95
+ end
96
+
97
+ def test_bad_byte
98
+ converter = MARC::Marc8::ToUnicode.new
99
+
100
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
101
+ assert_raise(Encoding::InvalidByteSequenceError) {
102
+ value = converter.transcode(bad_marc8)
103
+ }
104
+ end
105
+
106
+ def test_bad_byte_with_replacement
107
+ converter = MARC::Marc8::ToUnicode.new
108
+
109
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
110
+ value = converter.transcode(bad_marc8, :invalid => :replace)
111
+
112
+ assert_equal "UTF-8", value.encoding.name
113
+ assert value.valid_encoding?
114
+
115
+ assert value.include?("\uFFFD"), "includes replacement char"
116
+ # coalescing multiple replacement chars at end, could change
117
+ # to not do so, important thing is at least one is there.
118
+ assert_equal "米国の統治の仕組�", value
119
+ end
120
+
121
+ def test_bad_byte_with_specified_empty_replacement
122
+ converter = MARC::Marc8::ToUnicode.new
123
+
124
+ bad_marc8 = "\e$1!PVK7oi$N!Q1!G4i$N!0p!Q+{6924f6}\e(B"
125
+ value = converter.transcode(bad_marc8, :invalid => :replace, :replace => "")
126
+
127
+ assert_equal "UTF-8", value.encoding.name
128
+ assert value.valid_encoding?
129
+
130
+ assert_equal "米国の統治の仕組", value
131
+ end
132
+
133
+ def test_bad_escape
134
+ converter = MARC::Marc8::ToUnicode.new
135
+
136
+ # I do not understand what's going on here, or why this is
137
+ # desired/expected behavior. But this
138
+ # test is copied from pymarc , adapted to be straight data not marc record
139
+ # https://github.com/edsu/pymarc/blob/master/test/marc8.py?source=cc#L34
140
+
141
+ bad_escape_data = "La Soci\xE2et\e,"
142
+ value = converter.transcode(bad_escape_data)
143
+
144
+ assert_equal "UTF-8", value.encoding.name
145
+ assert value.valid_encoding?, "Valid encoding"
146
+
147
+ assert_equal "La Soci\u00E9t\x1B,", value
148
+ end
149
+
150
+ end
151
+ else
152
+ require 'pathname'
153
+ $stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
154
+ end
@@ -0,0 +1,40 @@
1
+ 01161cam a2200289 4500
2
+ 001 178448
3
+ 008 s1996 xx spa d
4
+ 035 $a X!b
5
+ 049 $a JHWV [AV] [NIRC] $n o
6
+ 096 $a WY 20.5 VC6 1996
7
+ 110 2 $a National Institutes of Health (U.S.)
8
+ 110 2 $a National Institute of Nursing Research (U.S.)#
9
+ (No separator at end of field length=51)
10
+ (Bad indicator data. Skipping 2 bytes)
11
+ 110 2 $a Department of Health & Human Services (U.S.)
12
+ (Bad indicator data. Skipping 2 bytes)
13
+ 245 0 $a Ten years at NIH : $b advancing health through science : the human dimension / $c Patricia A. Grady, Harold Varmus.
14
+ (Bad indicator data. Skipping 2 bytes)
15
+ 246 $a 10 years at NIH
16
+ (Bad indicator data. Skipping 2 bytes)
17
+ 300 $a 2 videocassettes (229 min.) : $b sd., col. ; $c 1/2 in.
18
+ (Bad indicator data. Skipping 2 bytes)
19
+ 520 $a A series of speakers recounts advances in nursing research from 1986 to 1996. ˜
20
+ (Bad indicator data. Skipping 2 bytes)
21
+ 538 $a VHS.
22
+ (Bad indicator data. Skipping 2 bytes)
23
+ 650 2 $a Nursing Care $x videocassettes
24
+ (Bad indicator data. Skipping 2 bytes)
25
+ 650 2 $a Nursing Research $x videocassettes
26
+ (Bad indicator data. Skipping 2 bytes)
27
+ 650 2 $a Nursing $x videocassettes
28
+ (Bad indicator data. Skipping 2 bytes)
29
+ 700 10 $a Grady, Patricia Anne, $d 1943-
30
+ (Bad indicator data. Skipping 2 bytes)
31
+ 700 1 $a Varmus, Harold
32
+ (Bad indicator data. Skipping 2 bytes)
33
+ 910 $a 178448 $b Horizon bib#
34
+ (Bad indicator data. Skipping 2 bytes)
35
+ 949 31 $7 1 $5 WY 20.5 VC6 1996 $0 26 $0 G $2 A $8 5 $4 1
36
+ (Bad indicator data. Skipping 2 bytes)
37
+ 991 $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199765 $l wempbk $m elsc
38
+ (Bad indicator data. Skipping 2 bytes)
39
+ 991 $a WY 20.5 VC6 1996 $f nlm $b wnlm $c c. 1 $q 0 $i 3199766 $l wempbk $m elsc
40
+
@@ -0,0 +1 @@
1
+ 01161cam a2200289 4500001000700000008004100007035000800048049002400056096002100080110004100101110005100142110004900193245011600242246002000358300005600378520008500434538000900519650003300528650003700561650002800598700003300626700001900659910002500678949004000703991006400743991006400807178448 s1996 xx spa d aX!b aJHWV [AV] [NIRC]no aWY 20.5 VC6 19962 aNational Institutes of Health (U.S.)2 aNational Institute of Nursing Research (U.S.)#7;2 aDepartment of Health & Human Services (U.S.) 0aTen years at NIH :badvancing health through science : the human dimension /cPatricia A. Grady, Harold Varmus. a10 years at NIH a2 videocassettes (229 min.) :bsd., col. ;c1/2 in. aA series of speakers recounts advances in nursing research from 1986 to 1996. ˜ aVHS. 2aNursing Carexvideocassettes 2aNursing Researchxvideocassettes 2aNursingxvideocassettes10aGrady, Patricia Anne,d1943-1 aVarmus, Harold a178448bHorizon bib#31715WY 20.5 VC6 19960260G2A8541 aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199765lwempbkmelsc aWY 20.5 VC6 1996fnlmbwnlmcc. 1q0i3199766lwempbkmelsc
@@ -44,7 +44,24 @@ if "".respond_to?(:encoding)
44
44
  assert_equal(encoding, record['001'].value.encoding.name)
45
45
  assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
46
46
  end
47
+
48
+ @@bad_marc8_path = "test/bad_eacc_encoding.marc8.marc"
47
49
 
50
+
51
+ def assert_all_values_valid_encoding(record, encoding_name="UTF-8")
52
+ record.fields.each do |field|
53
+ if field.kind_of? MARC::DataField
54
+ field.subfields.each do |sf|
55
+ assert_equal encoding_name, sf.value.encoding.name, "Is tagged #{encoding_name}: #{field.tag}: #{sf}"
56
+ assert field.value.valid_encoding?, "Is valid encoding: #{field.tag}: #{sf}"
57
+ end
58
+ else
59
+ assert_equal encoding_name, field.value.encoding.name, "Is tagged #{encoding_name}: #{field}"
60
+ assert field.value.valid_encoding?, "Is valid encoding: #{field}"
61
+ end
62
+ end
63
+ end
64
+
48
65
  ####
49
66
  # end helper methods
50
67
  ####
@@ -100,13 +117,57 @@ if "".respond_to?(:encoding)
100
117
  end
101
118
 
102
119
  def test_marc8_with_binary
103
- # Marc8, best we can do is read it in binary.
120
+ # Marc8, if we want to keep it without transcoding, best we can do is read it in binary.
104
121
  reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
105
122
  record = reader.first
106
123
 
107
124
  assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
108
125
  end
109
-
126
+
127
+ def test_marc8_converted_to_unicode
128
+ reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'MARC-8')
129
+ record = reader.first
130
+
131
+ assert_all_values_valid_encoding(record)
132
+
133
+ assert_equal "Serreau, Geneviève.", record['100']['a']
134
+ end
135
+
136
+ def test_marc8_converted_to_unicode_with_file_handle
137
+ # had some trouble with this one, let's ensure it with a test
138
+ file = File.new('test/marc8_accented_chars.marc')
139
+ reader = MARC::Reader.new(file, :external_encoding => "MARC-8")
140
+ record = reader.first
141
+
142
+ assert_all_values_valid_encoding(record)
143
+ end
144
+
145
+ def test_marc8_with_char_entity
146
+ reader = MARC::Reader.new("test/escaped_character_reference.marc8.marc", :external_encoding => "MARC-8")
147
+ record = reader.first
148
+
149
+ assert_all_values_valid_encoding(record)
150
+
151
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
152
+ end
153
+
154
+ def test_bad_marc8_raises
155
+ assert_raise(Encoding::InvalidByteSequenceError) do
156
+ reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8')
157
+ record = reader.first
158
+ end
159
+ end
160
+
161
+ def test_bad_marc8_with_replacement
162
+ reader = MARC::Reader.new(@@bad_marc8_path, :external_encoding => 'MARC-8', :invalid => :replace, :replace => "[?]")
163
+ record = reader.first
164
+
165
+ assert_all_values_valid_encoding(record)
166
+
167
+ assert record['880']['a'].include?("[?]"), "includes specified replacement string"
168
+ end
169
+
170
+
110
171
  def test_load_file_opened_with_external_encoding
111
172
  reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
112
173
 
@@ -125,12 +186,38 @@ if "".respond_to?(:encoding)
125
186
  end
126
187
 
127
188
  def test_from_string_with_utf8_encoding
128
- marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8")
189
+ marc_file = File.open(@@utf_marc_path)
129
190
 
130
- reader = MARC::Reader.new(StringIO.new(marc_string))
191
+ reader = MARC::Reader.new(marc_file)
131
192
  record = reader.first
132
193
 
133
- assert_utf8_right_in_utf8(record)
194
+
195
+
196
+
197
+ end
198
+
199
+ # Something that was failing in my client Blacklight code,
200
+ # bad bytes should be handled appropriately
201
+ def test_from_string_utf8_with_bad_byte
202
+ marc_file = File.open('test/marc_with_bad_utf8.utf8.marc')
203
+
204
+ reader = MARC::Reader.new(marc_file, :invalid => :replace)
205
+
206
+ record = reader.first
207
+
208
+ record.fields.each do |field|
209
+ if field.kind_of? MARC::ControlField
210
+ assert_equal "UTF-8", field.value.encoding.name
211
+ assert field.value.valid_encoding?
212
+ else
213
+ field.subfields.each do |subfield|
214
+ assert_equal "UTF-8", subfield.value.encoding.name
215
+ assert subfield.value.valid_encoding?, "value has valid encoding"
216
+ end
217
+ end
218
+ end
219
+
220
+ assert record['520']['a'].include?("\uFFFD"), "Value with bad byte now has Unicode Replacement Char"
134
221
  end
135
222
 
136
223
  def test_from_string_with_cp866
metadata CHANGED
@@ -1,32 +1,66 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: marc
3
3
  version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.7.1
4
+ version: 0.8.0
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Kevin Clarke
9
9
  - Bill Dueber
10
10
  - William Groppe
11
+ - Jonathan Rochkind
11
12
  - Ross Singer
12
13
  - Ed Summers
13
14
  autorequire: marc
14
15
  bindir: bin
15
16
  cert_chain: []
16
- date: 2013-09-09 00:00:00.000000000 Z
17
- dependencies: []
18
- description:
17
+ date: 2013-11-20 00:00:00.000000000 Z
18
+ dependencies:
19
+ - !ruby/object:Gem::Dependency
20
+ name: ensure_valid_encoding
21
+ requirement: !ruby/object:Gem::Requirement
22
+ none: false
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ type: :runtime
28
+ prerelease: false
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
34
+ version: '0'
35
+ - !ruby/object:Gem::Dependency
36
+ name: unf
37
+ requirement: !ruby/object:Gem::Requirement
38
+ none: false
39
+ requirements:
40
+ - - ! '>='
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ none: false
47
+ requirements:
48
+ - - ! '>='
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ description:
19
52
  email: ehs@pobox.com
20
53
  executables: []
21
54
  extensions: []
22
55
  extra_rdoc_files: []
23
56
  files:
24
- - lib/marc.rb
25
57
  - lib/marc/constants.rb
26
58
  - lib/marc/controlfield.rb
27
59
  - lib/marc/datafield.rb
28
60
  - lib/marc/dublincore.rb
29
61
  - lib/marc/exception.rb
62
+ - lib/marc/marc8/map_to_unicode.rb
63
+ - lib/marc/marc8/to_unicode.rb
30
64
  - lib/marc/reader.rb
31
65
  - lib/marc/record.rb
32
66
  - lib/marc/subfield.rb
@@ -36,12 +70,24 @@ files:
36
70
  - lib/marc/xml_parsers.rb
37
71
  - lib/marc/xmlreader.rb
38
72
  - lib/marc/xmlwriter.rb
73
+ - lib/marc.rb
74
+ - test/bad_eacc_encoding.marc8.marc
39
75
  - test/batch.dat
40
76
  - test/batch.xml
77
+ - test/bib178448.okay.human
78
+ - test/bib178448.okay.marc
79
+ - test/bib178448.writtenout.marc
41
80
  - test/cp866_multirecord.marc
42
81
  - test/cp866_unimarc.marc
82
+ - test/escaped_character_reference.marc8.marc
43
83
  - test/hebrew880s.marc
84
+ - test/marc8/data/test_marc8.txt
85
+ - test/marc8/data/test_utf8.txt
86
+ - test/marc8/tc_marc8_mapping.rb
87
+ - test/marc8/tc_to_unicode.rb
44
88
  - test/marc8_accented_chars.marc
89
+ - test/marc_with_bad_utf8.utf8.human
90
+ - test/marc_with_bad_utf8.utf8.marc
45
91
  - test/no-leading-zero.xml
46
92
  - test/non-numeric.dat
47
93
  - test/non-numeric.xml
@@ -59,7 +105,6 @@ files:
59
105
  - test/tc_reader_char_encodings.rb
60
106
  - test/tc_record.rb
61
107
  - test/tc_subfield.rb
62
- - test/tc_weird_jruby_bytes.rb
63
108
  - test/tc_writer.rb
64
109
  - test/tc_xml.rb
65
110
  - test/ts_marc.rb
@@ -73,27 +118,28 @@ files:
73
118
  homepage: https://github.com/ruby-marc/ruby-marc/
74
119
  licenses:
75
120
  - MIT
76
- post_install_message:
121
+ post_install_message:
77
122
  rdoc_options: []
78
123
  require_paths:
79
124
  - lib
80
125
  required_ruby_version: !ruby/object:Gem::Requirement
126
+ none: false
81
127
  requirements:
82
- - - '>='
128
+ - - ! '>='
83
129
  - !ruby/object:Gem::Version
84
130
  version: 1.8.6
85
- none: false
86
131
  required_rubygems_version: !ruby/object:Gem::Requirement
132
+ none: false
87
133
  requirements:
88
- - - '>='
134
+ - - ! '>='
89
135
  - !ruby/object:Gem::Version
90
136
  version: '0'
91
- none: false
92
137
  requirements: []
93
- rubyforge_project:
94
- rubygems_version: 1.8.24
95
- signing_key:
138
+ rubyforge_project:
139
+ rubygems_version: 1.8.23
140
+ signing_key:
96
141
  specification_version: 3
97
142
  summary: A ruby library for working with Machine Readable Cataloging
98
143
  test_files:
99
144
  - test/ts_marc.rb
145
+ has_rdoc: true