marc 0.4.4 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Changes +13 -0
- data/README.md +88 -0
- data/Rakefile +2 -26
- data/lib/marc.rb +1 -1
- data/lib/marc/reader.rb +270 -50
- data/lib/marc/version.rb +3 -0
- data/lib/marc/writer.rb +11 -3
- data/test/bare_cp866.txt +1 -0
- data/test/cp866_multirecord.marc +1 -0
- data/test/cp866_unimarc.marc +1 -0
- data/test/jruby_bad_transcode.rb +52 -0
- data/test/jruby_just_string.rb +39 -0
- data/test/marc8_accented_chars.marc +1 -0
- data/test/tc_bare_ruby_strings.rb +43 -0
- data/test/tc_reader.rb +21 -6
- data/test/tc_reader_char_encodings.rb +256 -0
- data/test/tc_writer.rb +14 -2
- data/test/test_cp866.txt +1 -0
- data/test/{000039829.marc → utf8.marc} +0 -0
- data/test/utf8_multirecord.marc +1 -0
- data/test/utf8_with_bad_bytes.marc +1 -0
- metadata +73 -41
- data/README +0 -55
- data/test/t +0 -1
data/lib/marc/version.rb
ADDED
data/lib/marc/writer.rb
CHANGED
@@ -55,7 +55,9 @@ module MARC
|
|
55
55
|
field_data += END_OF_FIELD
|
56
56
|
|
57
57
|
# calculate directory entry for the field
|
58
|
-
field_length = field_data.
|
58
|
+
field_length = (field_data.respond_to?(:bytesize) ?
|
59
|
+
field_data.bytesize() :
|
60
|
+
field_data.length())
|
59
61
|
directory += sprintf("%03s%04i%05i", field.tag, field_length,
|
60
62
|
offset)
|
61
63
|
|
@@ -73,10 +75,16 @@ module MARC
|
|
73
75
|
marc = base + fields + END_OF_RECORD
|
74
76
|
|
75
77
|
# update leader with the byte offset to the end of the directory
|
76
|
-
marc[12..16] = sprintf("%05i", base.
|
78
|
+
marc[12..16] = sprintf("%05i", (base.respond_to?(:bytesize) ?
|
79
|
+
base.bytesize() :
|
80
|
+
base.length() )
|
81
|
+
)
|
77
82
|
|
78
83
|
# update the record length
|
79
|
-
marc[0..4] = sprintf("%05i", marc.
|
84
|
+
marc[0..4] = sprintf("%05i", (marc.respond_to?(:bytesize) ?
|
85
|
+
marc.bytesize() :
|
86
|
+
marc.length())
|
87
|
+
)
|
80
88
|
|
81
89
|
# store updated leader in the record that was passed in
|
82
90
|
record.leader = marc[0..LEADER_LENGTH-1]
|
data/test/bare_cp866.txt
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
����㭠�. ������ ��� ����⬠��
|
@@ -0,0 +1 @@
|
|
1
|
+
011670000000002290004500001001600000003003300016004000500049005001200054006003400066007000500100021012500105035000300230036001000233042000300243043001000246076000800256100064200264501000500906504000900911507000300920514001400923����⥭�� �. �.����㭠�. ������ ��� ����⬠�����.07.01-01�.1�ਡ�������%�����१�����2005��⮬�⨧��� ������� � �����⮢�� �ந�����⢠ ��㪮������த�樨 � ������ ���ଠ樮���� ����࠭�⢥ �।�����01501.01.13RU�. 56, 58~N 9-10�࣠����� ᥬ���� ����㯨�� �������� _5��-���_6, ����� ���������� ����७��� ���ଠ樮���� ���������� �孮����� � ��ᨩ� ��諥�����. �।�⠢��� ��� CRAFT (Cooperative Research Action for Technology - ᮢ����� ��� ��������� ��� ᮧ����� �孮�����), ��䨭������� ��, � ��� ������ � ��ࠫ� �த��� Cimatron. ����� ��� �뫮 ࠧ��⨥ ����� �孮����� � �।�� �����१�����. ����室������ ࠡ�� � ������ ������ �뫠 �������� ⥬, �� ᥣ���� �� ����� �����⥩ (���ਬ��, �������ࣨ�, ��⨪�, ᥭ�ਪ�, � ��㣨�) �ॡ��� �ਬ������ ����⬠�ᮢ�� ��⠫�� �������᪨� ࠧ��AB0150.01.1301AB2007-������011670000000002290004500001001600000003003300016004000500049005001200054006003400066007000500100021012500105035000300230036001000233042000300243043001000246076000800256100064200264501000500906504000900911507000300920514001400923����⥭�� �. �.����㭠�. ������ ��� ����⬠�����.07.01-01�.1�ਡ�������%�����१�����2005��⮬�⨧��� ������� � �����⮢�� �ந�����⢠ ��㪮������த�樨 � ������ ���ଠ樮���� ����࠭�⢥ �।�����01501.01.13RU�. 56, 58~N 9-10�࣠����� ᥬ���� ����㯨�� �������� _5��-���_6, ����� ���������� ����७��� ���ଠ樮���� ���������� �孮����� � ��ᨩ� ��諥�����. �।�⠢��� ��� CRAFT (Cooperative Research Action for Technology - ᮢ����� ��� ��������� ��� ᮧ����� �孮�����), ��䨭������� ��, � ��� ������ � ��ࠫ� �த��� Cimatron. ����� ��� �뫮 ࠧ��⨥ ����� �孮����� � �।�� �����१�����. ����室������ ࠡ�� � ������ ������ �뫠 �������� ⥬, �� ᥣ���� �� ����� �����⥩ (���ਬ��, �������ࣨ�, ��⨪�, ᥭ�ਪ�, � ��㣨�) �ॡ��� �ਬ������ ����⬠�ᮢ�� ��⠫�� �������᪨� ࠧ��AB0150.01.1301AB2007-������
|
@@ -0,0 +1 @@
|
|
1
|
+
011670000000002290004500001001600000003003300016004000500049005001200054006003400066007000500100021012500105035000300230036001000233042000300243043001000246076000800256100064200264501000500906504000900911507000300920514001400923����⥭�� �. �.����㭠�. ������ ��� ����⬠�����.07.01-01�.1�ਡ�������%�����१�����2005��⮬�⨧��� ������� � �����⮢�� �ந�����⢠ ��㪮������த�樨 � ������ ���ଠ樮���� ����࠭�⢥ �।�����01501.01.13RU�. 56, 58~N 9-10�࣠����� ᥬ���� ����㯨�� �������� _5��-���_6, ����� ���������� ����७��� ���ଠ樮���� ���������� �孮����� � ��ᨩ� ��諥�����. �।�⠢��� ��� CRAFT (Cooperative Research Action for Technology - ᮢ����� ��� ��������� ��� ᮧ����� �孮�����), ��䨭������� ��, � ��� ������ � ��ࠫ� �த��� Cimatron. ����� ��� �뫮 ࠧ��⨥ ����� �孮����� � �।�� �����१�����. ����室������ ࠡ�� � ������ ������ �뫠 �������� ⥬, �� ᥣ���� �� ����� �����⥩ (���ਬ��, �������ࣨ�, ��⨪�, ᥭ�ਪ�, � ��㣨�) �ॡ��� �ਬ������ ����⬠�ᮢ�� ��⠫�� �������᪨� ࠧ��AB0150.01.1301AB2007-������
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# 1.9.3p0 :005 > 0x8D.chr.force_encoding("cp866").encode("UTF-8")
|
4
|
+
utf8 = "Н".force_encoding("UTF-8")
|
5
|
+
|
6
|
+
puts "There's a cyrillic letter that looks kinda like a capital H. Here's what it looks like in unicode: Н"
|
7
|
+
|
8
|
+
puts "In unicode, that's byte array: " + utf8.bytes.to_a.inspect
|
9
|
+
|
10
|
+
puts "We're gonna use String#encode to convert it to an IBM866 encoding, also known as cp866, an encoding sometimes used in Russia."
|
11
|
+
|
12
|
+
|
13
|
+
puts " `utf8.encode(\"IBM866\")`"
|
14
|
+
|
15
|
+
cp866 = utf8.encode("IBM866")
|
16
|
+
puts cp866.bytes.to_a.inspect
|
17
|
+
|
18
|
+
exit
|
19
|
+
|
20
|
+
puts
|
21
|
+
puts "In cp866, the actual bytes are: #{cp866_phrase.bytes.to_a.inspect}"
|
22
|
+
puts
|
23
|
+
|
24
|
+
puts "We're going to write the cp866 string to disk, using binary:binary to try and make sure we get the bytes to disk without transcoding."
|
25
|
+
|
26
|
+
write = File.open("test_cp866.txt", "w", :internal_encoding => "binary", :external_encoding => "binary")
|
27
|
+
write.puts cp866_phrase
|
28
|
+
write.close
|
29
|
+
puts
|
30
|
+
|
31
|
+
puts "Now we're going to read it in with a File object with external_encoding set to IBM866, but no internal_encoding set."
|
32
|
+
|
33
|
+
puts
|
34
|
+
puts "Make sure we have no default internal_encoding: " + Encoding.default_internal.nil?.inspect
|
35
|
+
|
36
|
+
read = File.open("test_cp866.txt", :external_encoding => "cp866")
|
37
|
+
puts
|
38
|
+
puts "Our ruby file object should have external_encoding of IBM866: " + read.external_encoding.inspect
|
39
|
+
puts " and internal_encoding nil: " + read.internal_encoding.inspect
|
40
|
+
|
41
|
+
puts
|
42
|
+
|
43
|
+
read_in_string = read.read
|
44
|
+
read.close
|
45
|
+
|
46
|
+
puts "The encoding of the string we read in should be IBM866: " + (read_in_string.encoding.name == "IBM866").inspect
|
47
|
+
|
48
|
+
puts
|
49
|
+
puts "And the bytes should be the very same bytes we wrote out (which are valid cp866) " + (read_in_string.bytes.to_a[0,3] == [140, 165, 166]).inspect + " (#{read_in_string.bytes.to_a})"
|
50
|
+
|
51
|
+
puts "The above is TRUE in MRI 1.9.3, but FALSE in jruby "
|
52
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: binary
|
2
|
+
|
3
|
+
# jruby 1.6.7 (ruby-1.9.2-p312) (2012-02-22 3e82bc8) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_20) [linux-amd64-java]
|
4
|
+
|
5
|
+
# There is a letter in cyrillic that looks kind of like a capital
|
6
|
+
# H. In the cp866 encoding (http://en.wikipedia.org/wiki/Code_page_866)
|
7
|
+
# it's represented by "\x8D" which is decimal 141.
|
8
|
+
#
|
9
|
+
# In ruby 1.9, it _ought_ to be possible to have those bytes
|
10
|
+
# in a string, and tell ruby it's cp866.
|
11
|
+
|
12
|
+
cp866 = "\x8D".force_encoding("IBM866")
|
13
|
+
|
14
|
+
# in MRI 1.9.3, if we inspect that, we get "\x8D", just like we expect.
|
15
|
+
# and if we look at #bytes.to_a, we get [141], just like we expect.
|
16
|
+
puts cp866.inspect
|
17
|
+
puts cp866.bytes.to_a.inspect
|
18
|
+
# However, in jruby if we #inspect instead of getting "\x8D",
|
19
|
+
# we get "\u008D" -- this is wrong, it's NOT that unicode codepoint.
|
20
|
+
# In jruby, bytes.to_a.inspect is still [141], it hasn't changed
|
21
|
+
# the bytes, but it's confused about what's going on.
|
22
|
+
|
23
|
+
# We see this encoding confusion demonstrated if we try
|
24
|
+
# a String#encode.
|
25
|
+
#
|
26
|
+
# MRI 1.9.3 is perfectly capable of transcoding this to UTF-8
|
27
|
+
|
28
|
+
utf8 = cp866.encode("UTF-8")
|
29
|
+
puts utf8.inspect # => in MRI displays cyrillic in terminal no prob
|
30
|
+
puts utf8.bytes.to_a.inspect # => in MRI [208, 157], proper bytes for utf8
|
31
|
+
|
32
|
+
# In jruby, puts utf8.inspect displays "\u008D", and
|
33
|
+
# utf8.bytes.to_a.inspect is [194, 141]. I don't know where the
|
34
|
+
# 194 came from, but it has NOT successfully transcoded to utf8.
|
35
|
+
|
36
|
+
# In other cases, the #encode will actually raise an illegal byte
|
37
|
+
# exception if the original bytes were not legal for UTF8 (or UTF16?) --
|
38
|
+
# but the original bytes were not meant to be considered unicode at all.
|
39
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
01120nam a22003011 4500001001000000003000400010005001700014008004100031010001300072035002400085035002400109035002300133040002400156100002500180245003700205260003100242300001100273490002700284504003100311505032600342650005500668852003300723970001300756971000800769972001300777973001900790998000900809000004951MiU19880715000000.0880715|1966||||||| |||||||fre|u a67006971 a(RLIN)MIUG0344054-B a(CaOTULAS)159823738 a(OCoLC)ocm00344054 cODaWUdMiUdCStRLIN1 aSerreau, Genevi�eve.10aHistoire du "nouveau th�e�atre." a[Paris]bGallimardcc1966. a190 p.0 aCollection Id�ees, 104 aBibliographical footnotes.0 aQuelques vivants piliers.--L'imm�ediat apr�es-guerre.--La f�ete des mots.--Eug�ene Ionesco.--Arthur Adamov.--Samuel Beckett.--Jean Genet.--Jean Vauthier.--Georges Schehad�e.--La rel�eve de l'avant-garde.--Les metteurs en sc�ene du "nouveau th�e�atre."--Cr�eations des principaux metteurs en sc�ene du "nouveau th�e�atre." 0aFrench dramay20th centuryxHistory and criticism.1 aMiUbBUHRcGRADh842 S4817hi aBKbBook aMiU c20040625 aACbavail_circ s9665
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
|
3
|
+
class TestBareRubyStrings < Test::Unit::TestCase
|
4
|
+
|
5
|
+
# The file bare_cp866.txt has in it a phrase encoded in cp866,
|
6
|
+
# that if it were translated to utf8 would be:
|
7
|
+
# "Междунар. новости мира пластмасс\n"
|
8
|
+
#
|
9
|
+
# The first few bytes of that in utf8 are:
|
10
|
+
# "\xD0\x9C\xD0\xB5"
|
11
|
+
#
|
12
|
+
# In cp866 as it is on disk, its first few bytes are "\x8C\xA5"
|
13
|
+
|
14
|
+
def test_read_cp866_with_external_encoding
|
15
|
+
return
|
16
|
+
file = File.open("test/bare_cp866.txt", "r:cp866")
|
17
|
+
string = file.read
|
18
|
+
|
19
|
+
assert_equal "IBM866", string.encoding.name
|
20
|
+
|
21
|
+
cp866_binary = string.dup.force_encoding("binary")
|
22
|
+
assert cp866_binary.start_with?( "\x8C\xA5".force_encoding("binary") )
|
23
|
+
|
24
|
+
transcoded = string.encode("UTF-8")
|
25
|
+
assert_equal "UTF-8", transcoded.encoding.name
|
26
|
+
|
27
|
+
utf8_binary = transcoded.dup.force_encoding("binary")
|
28
|
+
|
29
|
+
assert utf8_binary.start_with?( "\xD0\x9C\xD0\xB5".force_encoding("binary"))
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_read_cp866_binary_all_the_way
|
33
|
+
# tell ruby to treat it as binary binary binary
|
34
|
+
file = File.open("test/bare_cp866.txt", :external_encoding => "binary", :internal_encoding => "binary")
|
35
|
+
|
36
|
+
string = file.read
|
37
|
+
|
38
|
+
# we should get the same bytes that were on disk, right?
|
39
|
+
assert string.start_with?( "\x8C\xA5".force_encoding("binary"))
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
end
|
data/test/tc_reader.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
1
3
|
require 'test/unit'
|
2
4
|
require 'marc'
|
3
5
|
|
@@ -17,12 +19,30 @@ class ReaderTest < Test::Unit::TestCase
|
|
17
19
|
assert_equal(10, count)
|
18
20
|
end
|
19
21
|
|
22
|
+
def test_loose_utf8
|
23
|
+
# This isn't actually a corrupt file, but it is utf8,
|
24
|
+
# and I have some reason to believe forgiving reader isn't
|
25
|
+
# working properly with UTF8 in ruby 1.9, so testing it.
|
26
|
+
reader = MARC::ForgivingReader.new('test/utf8.marc')
|
27
|
+
count = 0
|
28
|
+
reader.each { count += 1 }
|
29
|
+
assert_equal(1, count)
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_loose_unimarc
|
33
|
+
# Unimarc might use a different record separator? Let's make sure it works.
|
34
|
+
reader = MARC::Reader.new(File.open('test/cp866_unimarc.marc', 'r:cp866'))
|
35
|
+
count = 0
|
36
|
+
reader.each {|a| count += 1 }
|
37
|
+
assert_equal(1, count)
|
38
|
+
end
|
39
|
+
|
20
40
|
def test_non_numeric_tags
|
21
41
|
reader = MARC::Reader.new('test/non-numeric.dat')
|
22
42
|
count = 0
|
23
43
|
record = nil
|
24
44
|
reader.each do | rec |
|
25
|
-
count += 1
|
45
|
+
count += 1
|
26
46
|
record = rec
|
27
47
|
end
|
28
48
|
assert_equal(1, count)
|
@@ -30,11 +50,6 @@ class ReaderTest < Test::Unit::TestCase
|
|
30
50
|
assert_equal('1', record['LOC']['9'])
|
31
51
|
end
|
32
52
|
|
33
|
-
def test_unicode_load
|
34
|
-
reader = MARC::Reader.new('test/000039829.marc')
|
35
|
-
assert_nothing_raised { reader.first }
|
36
|
-
end
|
37
|
-
|
38
53
|
def test_bad_marc
|
39
54
|
reader = MARC::Reader.new('test/tc_reader.rb')
|
40
55
|
assert_raises(MARC::Exception) {reader.entries[0]}
|
@@ -0,0 +1,256 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'marc'
|
5
|
+
|
6
|
+
# Testing char encodings under 1.9, don't bother running
|
7
|
+
# these tests except under 1.9, will either fail (because
|
8
|
+
# 1.9 func the test itself uses isn't there), or trivially pass
|
9
|
+
# (because the func they are testing is a no-op on 1.8).
|
10
|
+
|
11
|
+
if "".respond_to?(:encoding)
|
12
|
+
|
13
|
+
class ReaderCharEncodingsTest < Test::Unit::TestCase
|
14
|
+
####
|
15
|
+
# Helper methods for our tests
|
16
|
+
#
|
17
|
+
####
|
18
|
+
|
19
|
+
|
20
|
+
@@utf_marc_path = 'test/utf8.marc'
|
21
|
+
# tests against record at test/utf8.marc
|
22
|
+
def assert_utf8_right_in_utf8(record)
|
23
|
+
assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
|
24
|
+
|
25
|
+
assert_equal "UTF-8", record['245'].to_s.encoding.name
|
26
|
+
|
27
|
+
assert_equal "UTF-8", record['245'].subfields.first.to_s.encoding.name
|
28
|
+
assert_equal "UTF-8", record['245'].subfields.first.value.encoding.name
|
29
|
+
|
30
|
+
assert_equal "UTF-8", record['245']['a'].encoding.name
|
31
|
+
assert record['245']['a'].start_with?("Photčhanānukrom")
|
32
|
+
end
|
33
|
+
|
34
|
+
# Test against multirecord just to be sure that works.
|
35
|
+
# the multirecord file is just two concatenated copies
|
36
|
+
# of the single one.
|
37
|
+
@@cp866_marc_path = "test/cp866_multirecord.marc"
|
38
|
+
# assumes record in test/cp866_unimarc.marc
|
39
|
+
# Pass in an encoding name, using ruby's canonical name!
|
40
|
+
# "IBM866" not "cp866". "UTF-8".
|
41
|
+
def assert_cp866_right(record, encoding = "IBM866")
|
42
|
+
assert_equal(encoding, record['001'].value.encoding.name)
|
43
|
+
assert_equal(["d09d"], record['001'].value.encode("UTF-8").unpack('H4')) # russian capital N
|
44
|
+
end
|
45
|
+
|
46
|
+
####
|
47
|
+
# end helper methods
|
48
|
+
####
|
49
|
+
|
50
|
+
|
51
|
+
def test_unicode_load
|
52
|
+
reader = MARC::Reader.new(@@utf_marc_path)
|
53
|
+
|
54
|
+
record = nil
|
55
|
+
|
56
|
+
assert_nothing_raised { record = reader.first }
|
57
|
+
|
58
|
+
assert_utf8_right_in_utf8(record)
|
59
|
+
end
|
60
|
+
|
61
|
+
|
62
|
+
def test_unicode_decode_forgiving
|
63
|
+
# two kinds of forgiving invocation, they shouldn't be different,
|
64
|
+
# but just in case they have slightly different code paths, test em
|
65
|
+
# too.
|
66
|
+
marc_string = File.open(@@utf_marc_path).read.force_encoding("utf-8")
|
67
|
+
record = MARC::Reader.decode(marc_string, :forgiving => true)
|
68
|
+
assert_utf8_right_in_utf8(record)
|
69
|
+
|
70
|
+
|
71
|
+
reader = MARC::ForgivingReader.new(@@utf_marc_path)
|
72
|
+
record = reader.first
|
73
|
+
assert_utf8_right_in_utf8(record)
|
74
|
+
end
|
75
|
+
|
76
|
+
def test_unicode_forgiving_reader_passes_options
|
77
|
+
# Make sure ForgivingReader accepts same options as MARC::Reader
|
78
|
+
# We don't test them ALL though, just a sample.
|
79
|
+
# Tell it we're reading cp866, but transcode to utf8 for us.
|
80
|
+
reader = MARC::ForgivingReader.new(@@cp866_marc_path, :external_encoding => "cp866", :internal_encoding => "utf-8")
|
81
|
+
|
82
|
+
record = reader.first
|
83
|
+
|
84
|
+
assert_cp866_right(record, "UTF-8")
|
85
|
+
end
|
86
|
+
|
87
|
+
def test_explicit_encoding
|
88
|
+
reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'cp866')
|
89
|
+
|
90
|
+
assert_cp866_right(reader.first, "IBM866")
|
91
|
+
end
|
92
|
+
|
93
|
+
def test_bad_encoding_name_input
|
94
|
+
reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => 'adadfadf')
|
95
|
+
assert_raises ArgumentError do
|
96
|
+
reader.first
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
def test_marc8_with_binary
|
101
|
+
# Marc8, best we can do is read it in binary.
|
102
|
+
reader = MARC::Reader.new('test/marc8_accented_chars.marc', :external_encoding => 'binary')
|
103
|
+
record = reader.first
|
104
|
+
|
105
|
+
assert_equal "ASCII-8BIT", record['100'].subfields.first.value.encoding.name
|
106
|
+
end
|
107
|
+
|
108
|
+
def test_load_file_opened_with_external_encoding
|
109
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
|
110
|
+
|
111
|
+
record = reader.first
|
112
|
+
# Make sure it's got the encoding it's supposed to.
|
113
|
+
|
114
|
+
assert_cp866_right(record, "IBM866")
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_explicit_encoding_beats_file_encoding
|
118
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:utf-8'), :external_encoding => "cp866")
|
119
|
+
|
120
|
+
record = reader.first
|
121
|
+
|
122
|
+
assert_cp866_right(record, "IBM866")
|
123
|
+
end
|
124
|
+
|
125
|
+
def test_from_string_with_utf8_encoding
|
126
|
+
marc_string = File.open(@@utf_marc_path).read.force_encoding("UTF-8")
|
127
|
+
|
128
|
+
reader = MARC::Reader.new(StringIO.new(marc_string))
|
129
|
+
record = reader.first
|
130
|
+
|
131
|
+
assert_utf8_right_in_utf8(record)
|
132
|
+
end
|
133
|
+
|
134
|
+
def test_from_string_with_cp866
|
135
|
+
marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
|
136
|
+
|
137
|
+
reader = MARC::Reader.new(StringIO.new(marc_string))
|
138
|
+
record = reader.first
|
139
|
+
|
140
|
+
assert_cp866_right(record, "IBM866")
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_decode_from_string_with_cp866
|
144
|
+
marc_string = File.open(@@cp866_marc_path).read.force_encoding("cp866")
|
145
|
+
|
146
|
+
record = MARC::Reader.decode(marc_string)
|
147
|
+
|
148
|
+
assert_cp866_right(record, "IBM866")
|
149
|
+
end
|
150
|
+
|
151
|
+
def test_with_transcode
|
152
|
+
reader = MARC::Reader.new(@@cp866_marc_path,
|
153
|
+
:external_encoding => 'cp866',
|
154
|
+
:internal_encoding => 'UTF-8')
|
155
|
+
|
156
|
+
record = reader.first
|
157
|
+
|
158
|
+
assert_cp866_right(record, "UTF-8")
|
159
|
+
|
160
|
+
end
|
161
|
+
|
162
|
+
def test_with_binary_filehandle
|
163
|
+
# about to recommend this as a foolproof way to avoid
|
164
|
+
# ruby transcoding behind your back in docs, let's make
|
165
|
+
# sure it really works.
|
166
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, :external_encoding => "binary", :internal_encoding => "binary"),
|
167
|
+
:external_encoding => "IBM866")
|
168
|
+
|
169
|
+
record = reader.first
|
170
|
+
assert_cp866_right(record, "IBM866")
|
171
|
+
end
|
172
|
+
|
173
|
+
def test_with_bad_source_bytes
|
174
|
+
reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
|
175
|
+
:external_encoding => "UTF-8",
|
176
|
+
:validate_encoding => true)
|
177
|
+
|
178
|
+
assert_raise Encoding::InvalidByteSequenceError do
|
179
|
+
record = reader.first
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
def test_bad_source_bytes_with_replace
|
184
|
+
reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
|
185
|
+
:external_encoding => "UTF-8", :invalid => :replace)
|
186
|
+
|
187
|
+
record = nil
|
188
|
+
assert_nothing_raised do
|
189
|
+
record = reader.first
|
190
|
+
end
|
191
|
+
|
192
|
+
# it should have the unicode replacement char where the bad
|
193
|
+
# byte was.
|
194
|
+
assert_match '=> ' + "\uFFFD" + '( <=', record['245']['a']
|
195
|
+
end
|
196
|
+
|
197
|
+
def test_bad_source_bytes_with_custom_replace
|
198
|
+
reader = MARC::Reader.new('test/utf8_with_bad_bytes.marc',
|
199
|
+
:external_encoding => "UTF-8", :invalid => :replace, :replace => '')
|
200
|
+
|
201
|
+
record = reader.first
|
202
|
+
|
203
|
+
# bad byte replaced with empty string, gone.
|
204
|
+
assert_match '=> ( <=', record['245']['a']
|
205
|
+
|
206
|
+
end
|
207
|
+
|
208
|
+
def test_default_internal_encoding
|
209
|
+
# Some people WILL be changing their Encoding.default_internal
|
210
|
+
# It's even recommended by wycats
|
211
|
+
# http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
|
212
|
+
# This will in some cases make ruby File object trans-code
|
213
|
+
# by default. Trans-coding a serial marc binary can change the
|
214
|
+
# byte count and mess it up.
|
215
|
+
#
|
216
|
+
# But at present, because of the way the Reader is implemented reading
|
217
|
+
# specific bytecounts, it _works_, although it does not _respect_
|
218
|
+
# Encoding.default_internal. That's the best we can do right now,
|
219
|
+
# this test is important to ensure it stays at least this good.
|
220
|
+
begin
|
221
|
+
original = Encoding.default_internal
|
222
|
+
Encoding.default_internal = "UTF-8"
|
223
|
+
|
224
|
+
reader = MARC::Reader.new(File.open(@@cp866_marc_path, 'r:cp866'))
|
225
|
+
|
226
|
+
record = reader.first
|
227
|
+
|
228
|
+
assert_cp866_right(record, "IBM866")
|
229
|
+
ensure
|
230
|
+
Encoding.default_internal = original
|
231
|
+
end
|
232
|
+
end
|
233
|
+
|
234
|
+
def test_default_internal_encoding_with_string_arg
|
235
|
+
begin
|
236
|
+
original = Encoding.default_internal
|
237
|
+
Encoding.default_internal = "UTF-8"
|
238
|
+
|
239
|
+
reader = MARC::Reader.new(@@cp866_marc_path, :external_encoding => "cp866")
|
240
|
+
|
241
|
+
record = reader.first
|
242
|
+
|
243
|
+
assert_cp866_right(record, "IBM866")
|
244
|
+
ensure
|
245
|
+
Encoding.default_internal = original
|
246
|
+
end
|
247
|
+
end
|
248
|
+
|
249
|
+
end
|
250
|
+
|
251
|
+
|
252
|
+
|
253
|
+
else
|
254
|
+
require 'pathname'
|
255
|
+
$stderr.puts "\nTests not being run in ruby 1.9.x, skipping #{Pathname.new(__FILE__).basename}\n\n"
|
256
|
+
end
|