marc 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -0
- data/lib/marc/reader.rb +1 -1
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer-NEW.rb +108 -0
- data/lib/marc/writer.rb +62 -17
- data/test/tc_reader_char_encodings.rb +2 -0
- data/test/tc_writer.rb +98 -3
- metadata +3 -2
data/README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
[![Gem Version](https://badge.fury.io/rb/marc.png)](http://badge.fury.io/rb/marc)
|
1
2
|
[![Build Status](https://secure.travis-ci.org/ruby-marc/ruby-marc.png)](http://travis-ci.org/ruby-marc/ruby-marc)
|
2
3
|
|
3
4
|
marc is a ruby library for reading and writing MAchine Readable Cataloging
|
data/lib/marc/reader.rb
CHANGED
@@ -396,7 +396,7 @@ module MARC
|
|
396
396
|
# Like Reader, ForgivingReader lets you read in a batch of MARC21 records
|
397
397
|
# but it does not use record lengths and field byte offsets found in the
|
398
398
|
# leader and directory. It is not unusual to run across MARC records
|
399
|
-
# which have had their offsets
|
399
|
+
# which have had their offsets calculated wrong. In situations like this
|
400
400
|
# the vanilla Reader may fail, and you can try to use ForgivingReader.
|
401
401
|
#
|
402
402
|
# The one downside to this is that ForgivingReader will assume that the
|
data/lib/marc/version.rb
CHANGED
data/lib/marc/writer-NEW.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
module MARC
|
2
|
+
|
3
|
+
# A class for writing MARC records as MARC21.
|
4
|
+
|
5
|
+
class Writer
|
6
|
+
|
7
|
+
# the constructor which you must pass a file path
|
8
|
+
# or an object that responds to a write message
|
9
|
+
|
10
|
+
def initialize(file)
|
11
|
+
if file.class == String
|
12
|
+
@fh = File.new(file,"w")
|
13
|
+
elsif file.respond_to?('write')
|
14
|
+
@fh = file
|
15
|
+
else
|
16
|
+
throw "must pass in file name or handle"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
# write a record to the file or handle
|
22
|
+
|
23
|
+
def write(record)
|
24
|
+
@fh.write(MARC::Writer.encode(record))
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
# close underlying filehandle
|
29
|
+
|
30
|
+
def close
|
31
|
+
@fh.close
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
# a static method that accepts a MARC::Record object
|
36
|
+
# and returns the record encoded as MARC21 in transmission format
|
37
|
+
|
38
|
+
def self.encode(record)
|
39
|
+
directory = ''
|
40
|
+
fields = ''
|
41
|
+
offset = 0
|
42
|
+
for field in record.fields
|
43
|
+
|
44
|
+
# encode the field
|
45
|
+
field_data = ''
|
46
|
+
if field.class == MARC::DataField
|
47
|
+
warn("Warn: Missing indicator") unless field.indicator1 && field.indicator2
|
48
|
+
field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
|
49
|
+
for s in field.subfields
|
50
|
+
field_data += SUBFIELD_INDICATOR + s.code + s.value
|
51
|
+
end
|
52
|
+
elsif field.class == MARC::ControlField
|
53
|
+
field_data = field.value
|
54
|
+
end
|
55
|
+
field_data += END_OF_FIELD
|
56
|
+
|
57
|
+
# calculate directory entry for the field
|
58
|
+
field_length = (field_data.respond_to?(:bytesize) ?
|
59
|
+
field_data.bytesize() :
|
60
|
+
field_data.length())
|
61
|
+
directory += sprintf("%03s", field.tag) + format_byte_count(field_length, 4) + format_byte_count(offset)
|
62
|
+
|
63
|
+
|
64
|
+
# add field to data for other fields
|
65
|
+
fields += field_data
|
66
|
+
|
67
|
+
# update offset for next field
|
68
|
+
offset += field_length
|
69
|
+
end
|
70
|
+
|
71
|
+
# determine the base (leader + directory)
|
72
|
+
base = record.leader + directory + END_OF_FIELD
|
73
|
+
|
74
|
+
# determine complete record
|
75
|
+
marc = base + fields + END_OF_RECORD
|
76
|
+
|
77
|
+
# update leader with the byte offset to the end of the directory
|
78
|
+
marc[12..16] = format_byte_count(base.respond_to?(:bytesize) ?
|
79
|
+
base.bytesize() :
|
80
|
+
base.length()
|
81
|
+
)
|
82
|
+
|
83
|
+
# update the record length
|
84
|
+
marc[0..4] = format_byte_count(marc.respond_to?(:bytesize) ?
|
85
|
+
marc.bytesize() :
|
86
|
+
marc.length()
|
87
|
+
)
|
88
|
+
|
89
|
+
# store updated leader in the record that was passed in
|
90
|
+
record.leader = marc[0..LEADER_LENGTH-1]
|
91
|
+
|
92
|
+
# return encoded marc
|
93
|
+
return marc
|
94
|
+
end
|
95
|
+
|
96
|
+
def self.format_byte_count(number, num_digits=5)
|
97
|
+
formatted = sprintf("%0#{num_digits}i", number)
|
98
|
+
if formatted.length > num_digits
|
99
|
+
# uh, oh, we've exceeded our max. This version neither zeroes out
|
100
|
+
# nor raises; it fills the slot with all 9's instead.
|
101
|
+
#formatted = sprintf("%0#{num_digits}i", "")
|
102
|
+
formatted = "9" * num_digits
|
103
|
+
end
|
104
|
+
return formatted
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
108
|
+
end
|
data/lib/marc/writer.rb
CHANGED
@@ -1,8 +1,30 @@
|
|
1
1
|
module MARC
|
2
2
|
|
3
|
-
# A class for writing MARC records as
|
4
|
-
|
3
|
+
# A class for writing MARC records as binary MARC (ISO 2709)
|
4
|
+
#
|
5
|
+
# == Too-long records
|
6
|
+
#
|
7
|
+
# The MARC binary format only allows records that are at most 99999 bytes long in total,
|
8
|
+
# due to size of a length field in the record.
|
9
|
+
#
|
10
|
+
# By default, the Writer will raise a MARC::Exception when encountering
|
11
|
+
# in-memory records that are too long to be legally written out as ISO 2709
|
12
|
+
# binary.
|
13
|
+
|
14
|
+
# However, if you set `allow_oversized` to true, then the Writer will
|
15
|
+
# write these records out anyway, filling in any binary length/offset slots
|
16
|
+
# with all 0's, if they are not wide enough to hold the true value.
|
17
|
+
# While these records are illegal, they can still be read back in using
|
18
|
+
# the MARC::ForgivingReader, as well as other platform MARC readers
|
19
|
+
# in tolerant mode.
|
20
|
+
#
|
21
|
+
# If you set `allow_oversized` to false on the Writer, a MARC::Exception
|
22
|
+
# will be raised instead, if you try to write an oversized record.
|
23
|
+
#
|
24
|
+
# writer = Writer.new(some_path)
|
25
|
+
# writer.allow_oversized = true
|
5
26
|
class Writer
|
27
|
+
attr_accessor :allow_oversized
|
6
28
|
|
7
29
|
# the constructor which you must pass a file path
|
8
30
|
# or an object that responds to a write message
|
@@ -15,13 +37,14 @@ module MARC
|
|
15
37
|
else
|
16
38
|
throw "must pass in file name or handle"
|
17
39
|
end
|
40
|
+
self.allow_oversized = false
|
18
41
|
end
|
19
42
|
|
20
43
|
|
21
44
|
# write a record to the file or handle
|
22
45
|
|
23
46
|
def write(record)
|
24
|
-
@fh.write(MARC::Writer.encode(record))
|
47
|
+
@fh.write(MARC::Writer.encode(record, self.allow_oversized))
|
25
48
|
end
|
26
49
|
|
27
50
|
|
@@ -34,8 +57,10 @@ module MARC
|
|
34
57
|
|
35
58
|
# a static method that accepts a MARC::Record object
|
36
59
|
# and returns the record encoded as MARC21 in transmission format
|
37
|
-
|
38
|
-
|
60
|
+
#
|
61
|
+
# Second arg allow_oversized, default false, set to true
|
62
|
+
# to write out, instead of raising on, a MARC record that can't fit into ISO 2709.
|
63
|
+
def self.encode(record, allow_oversized = false)
|
39
64
|
directory = ''
|
40
65
|
fields = ''
|
41
66
|
offset = 0
|
@@ -58,8 +83,8 @@ module MARC
|
|
58
83
|
field_length = (field_data.respond_to?(:bytesize) ?
|
59
84
|
field_data.bytesize() :
|
60
85
|
field_data.length())
|
61
|
-
directory += sprintf("%03s
|
62
|
-
|
86
|
+
directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
|
87
|
+
|
63
88
|
|
64
89
|
# add field to data for other fields
|
65
90
|
fields += field_data
|
@@ -75,22 +100,42 @@ module MARC
|
|
75
100
|
marc = base + fields + END_OF_RECORD
|
76
101
|
|
77
102
|
# update leader with the byte offset to the end of the directory
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
)
|
103
|
+
bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
|
104
|
+
marc[12..16] = format_byte_count(bytesize, allow_oversized)
|
105
|
+
|
82
106
|
|
83
107
|
# update the record length
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
)
|
88
|
-
|
108
|
+
bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
|
109
|
+
marc[0..4] = format_byte_count(bytesize, allow_oversized)
|
110
|
+
|
89
111
|
# store updated leader in the record that was passed in
|
90
112
|
record.leader = marc[0..LEADER_LENGTH-1]
|
91
113
|
|
92
114
|
# return encoded marc
|
93
|
-
return marc
|
115
|
+
return marc
|
94
116
|
end
|
117
|
+
|
118
|
+
# Formats numbers for insertion into marc binary slots.
|
119
|
+
# These slots only allow so many digits (and need to be left-padded
|
120
|
+
# with zeros to that number of digits). If the number
|
121
|
+
# is too big, either an exception will be raised, or
|
122
|
+
# we'll return all 0's to proper number of digits.
|
123
|
+
#
|
124
|
+
# first arg is number, second is boolean whether to allow oversized,
|
125
|
+
# third is max digits (default 5)
|
126
|
+
def self.format_byte_count(number, allow_oversized, num_digits=5)
|
127
|
+
formatted = sprintf("%0#{num_digits}i", number)
|
128
|
+
if formatted.length > num_digits
|
129
|
+
# uh, oh, we've exceeded our max. Either zero out
|
130
|
+
# or raise, depending on settings.
|
131
|
+
if allow_oversized
|
132
|
+
formatted = sprintf("%0#{num_digits}i", 0)
|
133
|
+
else
|
134
|
+
raise MARC::Exception.new("Can't write MARC record, as length/offset value of #{number} is too long for the #{num_digits} slot in binary format.")
|
135
|
+
end
|
136
|
+
end
|
137
|
+
return formatted
|
138
|
+
end
|
139
|
+
|
95
140
|
end
|
96
141
|
end
|
data/test/tc_writer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'marc'
|
3
3
|
|
4
|
+
require 'stringio'
|
5
|
+
|
4
6
|
class WriterTest < Test::Unit::TestCase
|
5
7
|
|
6
8
|
def test_writer
|
@@ -19,6 +21,99 @@ class WriterTest < Test::Unit::TestCase
|
|
19
21
|
# cleanup
|
20
22
|
File.unlink('test/writer.dat')
|
21
23
|
end
|
24
|
+
|
25
|
+
# Only in ruby 1.9
|
26
|
+
if "".respond_to?(:encoding)
|
27
|
+
def test_writer_bad_encoding
|
28
|
+
writer = MARC::Writer.new('test/writer.dat')
|
29
|
+
|
30
|
+
|
31
|
+
# MARC::Writer should just happily write out whatever bytes you give it, even
|
32
|
+
# mixing encodings that can't be mixed. We ran into an actual example mixing
|
33
|
+
# MARC8 (tagged ruby binary) and UTF8, we want it to be written out.
|
34
|
+
|
35
|
+
record = MARC::Record.new
|
36
|
+
|
37
|
+
record.append MARC::DataField.new('700', '0', ' ', ['a', "Nhouy Abhay,".force_encoding("BINARY")], ["c", "Th\xE5ao,".force_encoding("BINARY")], ["d", "1909-"])
|
38
|
+
record.append MARC::DataField.new('700', '0', ' ', ['a', "Somchin P\xF8\xE5o. Ngin,".force_encoding("BINARY")])
|
39
|
+
|
40
|
+
record.append MARC::DataField.new('100', '0', '0', ['a', "\xE5angkham. ".force_encoding("BINARY")])
|
41
|
+
record.append MARC::DataField.new('245', '1', '0', ['b', "chef-d'oeuvre de la litt\xE2erature lao".force_encoding("BINARY")])
|
42
|
+
|
43
|
+
# One in UTF8 and marked
|
44
|
+
record.append MARC::DataField.new('999', '0', '1', ['a', "chef-d'ocuvre de la litt\xC3\xA9rature".force_encoding("UTF-8")])
|
45
|
+
|
46
|
+
writer.write(record)
|
47
|
+
writer.close
|
48
|
+
|
49
|
+
ensure
|
50
|
+
File.unlink('test/writer.dat')
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def test_write_too_long_iso2709
|
55
|
+
too_long_record = MARC::Record.new
|
56
|
+
1.upto(1001) do
|
57
|
+
too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
|
58
|
+
end
|
59
|
+
|
60
|
+
wbuffer = StringIO.new("", "w")
|
61
|
+
writer = MARC::Writer.new(wbuffer)
|
62
|
+
writer.allow_oversized = true
|
63
|
+
|
64
|
+
writer.write(too_long_record)
|
65
|
+
writer.close
|
66
|
+
|
67
|
+
assert_equal "00000", wbuffer.string.slice(0, 5), "zero'd out length bytes when too long"
|
68
|
+
|
69
|
+
rbuffer = StringIO.new(wbuffer.string.dup)
|
70
|
+
|
71
|
+
# Regular reader won't read our illegal record.
|
72
|
+
#assert_raise(NoMethodError) do
|
73
|
+
# reader = MARC::Reader.new(rbuffer)
|
74
|
+
# reader.first
|
75
|
+
#end
|
76
|
+
|
77
|
+
# Forgiving reader will, round trippable
|
78
|
+
new_record = MARC::Reader.decode(rbuffer.string, :forgiving => true)
|
79
|
+
assert_equal too_long_record, new_record, "Too long record round-trippable with forgiving mode"
|
80
|
+
|
81
|
+
# Test in the middle of a MARC file
|
82
|
+
good_record = MARC::Record.new
|
83
|
+
good_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A short record'])
|
84
|
+
wbuffer = StringIO.new("", "w")
|
85
|
+
writer = MARC::Writer.new(wbuffer)
|
86
|
+
writer.allow_oversized = true
|
87
|
+
|
88
|
+
writer.write(good_record)
|
89
|
+
writer.write(too_long_record)
|
90
|
+
writer.write(good_record)
|
91
|
+
|
92
|
+
rbuffer = StringIO.new(wbuffer.string.dup)
|
93
|
+
reader = MARC::ForgivingReader.new(rbuffer)
|
94
|
+
records = reader.to_a
|
95
|
+
|
96
|
+
assert_equal 3, records.length
|
97
|
+
assert_equal good_record, records[0]
|
98
|
+
assert_equal good_record, records[2]
|
99
|
+
assert_equal too_long_record, records[1]
|
100
|
+
end
|
101
|
+
|
102
|
+
def test_raises_on_too_long_if_configured
|
103
|
+
too_long_record = MARC::Record.new
|
104
|
+
1.upto(1001) do
|
105
|
+
too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
|
106
|
+
end
|
107
|
+
|
108
|
+
wbuffer = StringIO.new("", "w")
|
109
|
+
writer = MARC::Writer.new(wbuffer)
|
110
|
+
|
111
|
+
assert_raise(MARC::Exception) do
|
112
|
+
writer.write too_long_record
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
|
22
117
|
|
23
118
|
def test_forgiving_writer
|
24
119
|
marc = "00305cam a2200133 a 4500001000700000003000900007005001700016008004100033008004100074035002500115245001700140909001000157909000400167\036635145\036UK-BiLMS\03620060329173705.0\036s1982iieng6 000 0 eng||\036060116|||||||||xxk eng||\036 \037a(UK-BiLMS)M0017366ZW\03600\037aTest record.\036 \037aa\037b\037c\036\037b0\036\035\000"
|
@@ -29,18 +124,18 @@ class WriterTest < Test::Unit::TestCase
|
|
29
124
|
end
|
30
125
|
|
31
126
|
def test_unicode_roundtrip
|
32
|
-
record = MARC::Reader.new('test/utf8.marc').first
|
127
|
+
record = MARC::Reader.new('test/utf8.marc', :external_encoding => "UTF-8").first
|
33
128
|
|
34
129
|
writer = MARC::Writer.new('test/writer.dat')
|
35
130
|
writer.write(record)
|
36
131
|
writer.close
|
37
132
|
|
38
|
-
read_back_record = MARC::Reader.new('test/writer.dat').first
|
133
|
+
read_back_record = MARC::Reader.new('test/writer.dat', :external_encoding => "UTF-8").first
|
39
134
|
|
40
135
|
# Make sure the one we wrote out then read in again
|
41
136
|
# is the same as the one we read the first time
|
42
137
|
# Looks like "==" is over-ridden to do that. Don't ever change, #==
|
43
|
-
|
138
|
+
assert_equal record, read_back_record, "Round-tripped record must equal original record"
|
44
139
|
end
|
45
140
|
|
46
141
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: marc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ authors:
|
|
13
13
|
autorequire: marc
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date: 2013-
|
16
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
17
17
|
dependencies: []
|
18
18
|
description:
|
19
19
|
email: ehs@pobox.com
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- lib/marc/record.rb
|
31
31
|
- lib/marc/subfield.rb
|
32
32
|
- lib/marc/version.rb
|
33
|
+
- lib/marc/writer-NEW.rb
|
33
34
|
- lib/marc/writer.rb
|
34
35
|
- lib/marc/xml_parsers.rb
|
35
36
|
- lib/marc/xmlreader.rb
|