marc 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -0
- data/lib/marc/reader.rb +1 -1
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer-NEW.rb +108 -0
- data/lib/marc/writer.rb +62 -17
- data/test/tc_reader_char_encodings.rb +2 -0
- data/test/tc_writer.rb +98 -3
- metadata +3 -2
data/README.md
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
[](http://badge.fury.io/rb/marc)
|
1
2
|
[](http://travis-ci.org/ruby-marc/ruby-marc)
|
2
3
|
|
3
4
|
marc is a ruby library for reading and writing MAchine Readable Cataloging
|
data/lib/marc/reader.rb
CHANGED
@@ -396,7 +396,7 @@ module MARC
|
|
396
396
|
# Like Reader ForgivingReader lets you read in a batch of MARC21 records
|
397
397
|
# but it does not use record lengths and field byte offsets found in the
|
398
398
|
# leader and directory. It is not unusual to run across MARC records
|
399
|
-
# which have had their offsets
|
399
|
+
# which have had their offsets calculated wrong. In situations like this
|
400
400
|
# the vanilla Reader may fail, and you can try to use ForgivingReader.
|
401
401
|
#
|
402
402
|
# The one downside to this is that ForgivingReader will assume that the
|
data/lib/marc/version.rb
CHANGED
data/lib/marc/writer-NEW.rb
CHANGED
@@ -0,0 +1,108 @@
|
|
1
|
+
module MARC
|
2
|
+
|
3
|
+
# A class for writing MARC records as MARC21.
|
4
|
+
|
5
|
+
class Writer
|
6
|
+
|
7
|
+
# the constructor which you must pass a file path
|
8
|
+
# or an object that responds to a write message
|
9
|
+
|
10
|
+
def initialize(file)
|
11
|
+
if file.class == String
|
12
|
+
@fh = File.new(file,"w")
|
13
|
+
elsif file.respond_to?('write')
|
14
|
+
@fh = file
|
15
|
+
else
|
16
|
+
throw "must pass in file name or handle"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
# write a record to the file or handle
|
22
|
+
|
23
|
+
def write(record)
|
24
|
+
@fh.write(MARC::Writer.encode(record))
|
25
|
+
end
|
26
|
+
|
27
|
+
|
28
|
+
# close underlying filehandle
|
29
|
+
|
30
|
+
def close
|
31
|
+
@fh.close
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
# a static method that accepts a MARC::Record object
|
36
|
+
# and returns the record encoded as MARC21 in transmission format
|
37
|
+
|
38
|
+
def self.encode(record)
|
39
|
+
directory = ''
|
40
|
+
fields = ''
|
41
|
+
offset = 0
|
42
|
+
for field in record.fields
|
43
|
+
|
44
|
+
# encode the field
|
45
|
+
field_data = ''
|
46
|
+
if field.class == MARC::DataField
|
47
|
+
warn("Warn: Missing indicator") unless field.indicator1 && field.indicator2
|
48
|
+
field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
|
49
|
+
for s in field.subfields
|
50
|
+
field_data += SUBFIELD_INDICATOR + s.code + s.value
|
51
|
+
end
|
52
|
+
elsif field.class == MARC::ControlField
|
53
|
+
field_data = field.value
|
54
|
+
end
|
55
|
+
field_data += END_OF_FIELD
|
56
|
+
|
57
|
+
# calculate directory entry for the field
|
58
|
+
field_length = (field_data.respond_to?(:bytesize) ?
|
59
|
+
field_data.bytesize() :
|
60
|
+
field_data.length())
|
61
|
+
directory += sprintf("%03s", field.tag) + format_byte_count(field_length, 4) + format_byte_count(offset)
|
62
|
+
|
63
|
+
|
64
|
+
# add field to data for other fields
|
65
|
+
fields += field_data
|
66
|
+
|
67
|
+
# update offset for next field
|
68
|
+
offset += field_length
|
69
|
+
end
|
70
|
+
|
71
|
+
# determine the base (leader + directory)
|
72
|
+
base = record.leader + directory + END_OF_FIELD
|
73
|
+
|
74
|
+
# determine complete record
|
75
|
+
marc = base + fields + END_OF_RECORD
|
76
|
+
|
77
|
+
# update leader with the byte offset to the end of the directory
|
78
|
+
marc[12..16] = format_byte_count(base.respond_to?(:bytesize) ?
|
79
|
+
base.bytesize() :
|
80
|
+
base.length()
|
81
|
+
)
|
82
|
+
|
83
|
+
# update the record length
|
84
|
+
marc[0..4] = format_byte_count(marc.respond_to?(:bytesize) ?
|
85
|
+
marc.bytesize() :
|
86
|
+
marc.length()
|
87
|
+
)
|
88
|
+
|
89
|
+
# store updated leader in the record that was passed in
|
90
|
+
record.leader = marc[0..LEADER_LENGTH-1]
|
91
|
+
|
92
|
+
# return encoded marc
|
93
|
+
return marc
|
94
|
+
end
|
95
|
+
|
96
|
+
def self.format_byte_count(number, num_digits=5)
|
97
|
+
formatted = sprintf("%0#{num_digits}i", number)
|
98
|
+
if formatted.length > num_digits
|
99
|
+
# uh, oh, we've exceeded our max. Either zero out
|
100
|
+
# or raise, depending on settings.
|
101
|
+
#formatted = sprintf("%0#{num_digits}i", "")
|
102
|
+
formatted = "9" * num_digits
|
103
|
+
end
|
104
|
+
return formatted
|
105
|
+
end
|
106
|
+
|
107
|
+
end
|
108
|
+
end
|
data/lib/marc/writer.rb
CHANGED
@@ -1,8 +1,30 @@
|
|
1
1
|
module MARC
|
2
2
|
|
3
|
-
# A class for writing MARC records as MARC21.
|
4
|
-
|
3
|
+
# A class for writing MARC records as binary MARC (ISO 2709)
|
4
|
+
#
|
5
|
+
# == Too-long records
|
6
|
+
#
|
7
|
+
# The MARC binary format only allows records that are at most 99999 bytes long in total,
|
8
|
+
# due to size of a length field in the record.
|
9
|
+
#
|
10
|
+
# By default, the Writer will raise a MARC::Exception when encountering
|
11
|
+
# in-memory records that are too long to be legally written out as ISO 2709
|
12
|
+
# binary.
|
13
|
+
|
14
|
+
# However, if you set `allow_oversized` to true, then the Writer will
|
15
|
+
# write these records out anyway, filling in any binary length/offset slots
|
16
|
+
# with all 0's, if they are not wide enough to hold the true value.
|
17
|
+
# While these records are illegal, they can still be read back in using
|
18
|
+
# the MARC::ForgivingReader, as well as other platform MARC readers
|
19
|
+
# in tolerant mode.
|
20
|
+
#
|
21
|
+
# If you set `allow_oversized` to false on the Writer, a MARC::Exception
|
22
|
+
# will be raised instead, if you try to write an oversized record.
|
23
|
+
#
|
24
|
+
# writer = Writer.new(some_path)
|
25
|
+
# writer.allow_oversized = true
|
5
26
|
class Writer
|
27
|
+
attr_accessor :allow_oversized
|
6
28
|
|
7
29
|
# the constructor which you must pass a file path
|
8
30
|
# or an object that responds to a write message
|
@@ -15,13 +37,14 @@ module MARC
|
|
15
37
|
else
|
16
38
|
throw "must pass in file name or handle"
|
17
39
|
end
|
40
|
+
self.allow_oversized = false
|
18
41
|
end
|
19
42
|
|
20
43
|
|
21
44
|
# write a record to the file or handle
|
22
45
|
|
23
46
|
def write(record)
|
24
|
-
@fh.write(MARC::Writer.encode(record))
|
47
|
+
@fh.write(MARC::Writer.encode(record, self.allow_oversized))
|
25
48
|
end
|
26
49
|
|
27
50
|
|
@@ -34,8 +57,10 @@ module MARC
|
|
34
57
|
|
35
58
|
# a static method that accepts a MARC::Record object
|
36
59
|
# and returns the record encoded as MARC21 in transmission format
|
37
|
-
|
38
|
-
|
60
|
+
#
|
61
|
+
# Second arg allow_oversized, default false, set to true
|
62
|
+
# to write out a MARC record that can't fit into ISO 2709 instead of raising.
|
63
|
+
def self.encode(record, allow_oversized = false)
|
39
64
|
directory = ''
|
40
65
|
fields = ''
|
41
66
|
offset = 0
|
@@ -58,8 +83,8 @@ module MARC
|
|
58
83
|
field_length = (field_data.respond_to?(:bytesize) ?
|
59
84
|
field_data.bytesize() :
|
60
85
|
field_data.length())
|
61
|
-
directory += sprintf("%03s", field.tag) + format_byte_count(field_length, 4) + format_byte_count(offset)
|
62
|
-
|
86
|
+
directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
|
87
|
+
|
63
88
|
|
64
89
|
# add field to data for other fields
|
65
90
|
fields += field_data
|
@@ -75,22 +100,42 @@ module MARC
|
|
75
100
|
marc = base + fields + END_OF_RECORD
|
76
101
|
|
77
102
|
# update leader with the byte offset to the end of the directory
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
)
|
103
|
+
bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
|
104
|
+
marc[12..16] = format_byte_count(bytesize, allow_oversized)
|
105
|
+
|
82
106
|
|
83
107
|
# update the record length
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
)
|
88
|
-
|
108
|
+
bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
|
109
|
+
marc[0..4] = format_byte_count(bytesize, allow_oversized)
|
110
|
+
|
89
111
|
# store updated leader in the record that was passed in
|
90
112
|
record.leader = marc[0..LEADER_LENGTH-1]
|
91
113
|
|
92
114
|
# return encoded marc
|
93
|
-
return marc
|
115
|
+
return marc
|
94
116
|
end
|
117
|
+
|
118
|
+
# Formats numbers for insertion into marc binary slots.
|
119
|
+
# These slots only allow so many digits (and need to be left-padded
|
120
|
+
# with zeros to that number of digits). If the number
|
121
|
+
# is too big, either an exception will be raised, or
|
122
|
+
# we'll return all 0's to proper number of digits.
|
123
|
+
#
|
124
|
+
# first arg is number, second is boolean whether to allow oversized,
|
125
|
+
# third is max digits (default 5)
|
126
|
+
def self.format_byte_count(number, allow_oversized, num_digits=5)
|
127
|
+
formatted = sprintf("%0#{num_digits}i", number)
|
128
|
+
if formatted.length > num_digits
|
129
|
+
# uh, oh, we've exceeded our max. Either zero out
|
130
|
+
# or raise, depending on settings.
|
131
|
+
if allow_oversized
|
132
|
+
formatted = sprintf("%0#{num_digits}i", 0)
|
133
|
+
else
|
134
|
+
raise MARC::Exception.new("Can't write MARC record, as length/offset value of #{number} is too long for the #{num_digits} slot in binary format.")
|
135
|
+
end
|
136
|
+
end
|
137
|
+
return formatted
|
138
|
+
end
|
139
|
+
|
95
140
|
end
|
96
141
|
end
|
data/test/tc_writer.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'marc'
|
3
3
|
|
4
|
+
require 'stringio'
|
5
|
+
|
4
6
|
class WriterTest < Test::Unit::TestCase
|
5
7
|
|
6
8
|
def test_writer
|
@@ -19,6 +21,99 @@ class WriterTest < Test::Unit::TestCase
|
|
19
21
|
# cleanup
|
20
22
|
File.unlink('test/writer.dat')
|
21
23
|
end
|
24
|
+
|
25
|
+
# Only in ruby 1.9
|
26
|
+
if "".respond_to?(:encoding)
|
27
|
+
def test_writer_bad_encoding
|
28
|
+
writer = MARC::Writer.new('test/writer.dat')
|
29
|
+
|
30
|
+
|
31
|
+
# MARC::Writer should just happily write out whatever bytes you give it, even
|
32
|
+
# mixing encodings that can't be mixed. We ran into an actual example mixing
|
33
|
+
# MARC8 (tagged ruby binary) and UTF8, we want it to be written out.
|
34
|
+
|
35
|
+
record = MARC::Record.new
|
36
|
+
|
37
|
+
record.append MARC::DataField.new('700', '0', ' ', ['a', "Nhouy Abhay,".force_encoding("BINARY")], ["c", "Th\xE5ao,".force_encoding("BINARY")], ["d", "1909-"])
|
38
|
+
record.append MARC::DataField.new('700', '0', ' ', ['a', "Somchin P\xF8\xE5o. Ngin,".force_encoding("BINARY")])
|
39
|
+
|
40
|
+
record.append MARC::DataField.new('100', '0', '0', ['a', "\xE5angkham. ".force_encoding("BINARY")])
|
41
|
+
record.append MARC::DataField.new('245', '1', '0', ['b', "chef-d'oeuvre de la litt\xE2erature lao".force_encoding("BINARY")])
|
42
|
+
|
43
|
+
# One in UTF8 and marked
|
44
|
+
record.append MARC::DataField.new('999', '0', '1', ['a', "chef-d'ocuvre de la litt\xC3\xA9rature".force_encoding("UTF-8")])
|
45
|
+
|
46
|
+
writer.write(record)
|
47
|
+
writer.close
|
48
|
+
|
49
|
+
ensure
|
50
|
+
File.unlink('test/writer.dat')
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def test_write_too_long_iso2709
|
55
|
+
too_long_record = MARC::Record.new
|
56
|
+
1.upto(1001) do
|
57
|
+
too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
|
58
|
+
end
|
59
|
+
|
60
|
+
wbuffer = StringIO.new("", "w")
|
61
|
+
writer = MARC::Writer.new(wbuffer)
|
62
|
+
writer.allow_oversized = true
|
63
|
+
|
64
|
+
writer.write(too_long_record)
|
65
|
+
writer.close
|
66
|
+
|
67
|
+
assert_equal "00000", wbuffer.string.slice(0, 5), "zero'd out length bytes when too long"
|
68
|
+
|
69
|
+
rbuffer = StringIO.new(wbuffer.string.dup)
|
70
|
+
|
71
|
+
# Regular reader won't read our illegal record.
|
72
|
+
#assert_raise(NoMethodError) do
|
73
|
+
# reader = MARC::Reader.new(rbuffer)
|
74
|
+
# reader.first
|
75
|
+
#end
|
76
|
+
|
77
|
+
# Forgiving reader will, round trippable
|
78
|
+
new_record = MARC::Reader.decode(rbuffer.string, :forgiving => true)
|
79
|
+
assert_equal too_long_record, new_record, "Too long record round-trippable with forgiving mode"
|
80
|
+
|
81
|
+
# Test in the middle of a MARC file
|
82
|
+
good_record = MARC::Record.new
|
83
|
+
good_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A short record'])
|
84
|
+
wbuffer = StringIO.new("", "w")
|
85
|
+
writer = MARC::Writer.new(wbuffer)
|
86
|
+
writer.allow_oversized = true
|
87
|
+
|
88
|
+
writer.write(good_record)
|
89
|
+
writer.write(too_long_record)
|
90
|
+
writer.write(good_record)
|
91
|
+
|
92
|
+
rbuffer = StringIO.new(wbuffer.string.dup)
|
93
|
+
reader = MARC::ForgivingReader.new(rbuffer)
|
94
|
+
records = reader.to_a
|
95
|
+
|
96
|
+
assert_equal 3, records.length
|
97
|
+
assert_equal good_record, records[0]
|
98
|
+
assert_equal good_record, records[2]
|
99
|
+
assert_equal too_long_record, records[1]
|
100
|
+
end
|
101
|
+
|
102
|
+
def test_raises_on_too_long_if_configured
|
103
|
+
too_long_record = MARC::Record.new
|
104
|
+
1.upto(1001) do
|
105
|
+
too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
|
106
|
+
end
|
107
|
+
|
108
|
+
wbuffer = StringIO.new("", "w")
|
109
|
+
writer = MARC::Writer.new(wbuffer)
|
110
|
+
|
111
|
+
assert_raise(MARC::Exception) do
|
112
|
+
writer.write too_long_record
|
113
|
+
end
|
114
|
+
|
115
|
+
end
|
116
|
+
|
22
117
|
|
23
118
|
def test_forgiving_writer
|
24
119
|
marc = "00305cam a2200133 a 4500001000700000003000900007005001700016008004100033008004100074035002500115245001700140909001000157909000400167\036635145\036UK-BiLMS\03620060329173705.0\036s1982iieng6 000 0 eng||\036060116|||||||||xxk eng||\036 \037a(UK-BiLMS)M0017366ZW\03600\037aTest record.\036 \037aa\037b\037c\036\037b0\036\035\000"
|
@@ -29,18 +124,18 @@ class WriterTest < Test::Unit::TestCase
|
|
29
124
|
end
|
30
125
|
|
31
126
|
def test_unicode_roundtrip
|
32
|
-
record = MARC::Reader.new('test/utf8.marc').first
|
127
|
+
record = MARC::Reader.new('test/utf8.marc', :external_encoding => "UTF-8").first
|
33
128
|
|
34
129
|
writer = MARC::Writer.new('test/writer.dat')
|
35
130
|
writer.write(record)
|
36
131
|
writer.close
|
37
132
|
|
38
|
-
read_back_record = MARC::Reader.new('test/writer.dat').first
|
133
|
+
read_back_record = MARC::Reader.new('test/writer.dat', :external_encoding => "UTF-8").first
|
39
134
|
|
40
135
|
# Make sure the one we wrote out then read in again
|
41
136
|
# is the same as the one we read the first time
|
42
137
|
# Looks like "==" is over-ridden to do that. Don't ever change, #==
|
43
|
-
|
138
|
+
assert_equal record, read_back_record, "Round-tripped record must equal original record"
|
44
139
|
end
|
45
140
|
|
46
141
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: marc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ authors:
|
|
13
13
|
autorequire: marc
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date: 2013-
|
16
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
17
17
|
dependencies: []
|
18
18
|
description:
|
19
19
|
email: ehs@pobox.com
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- lib/marc/record.rb
|
31
31
|
- lib/marc/subfield.rb
|
32
32
|
- lib/marc/version.rb
|
33
|
+
- lib/marc/writer-NEW.rb
|
33
34
|
- lib/marc/writer.rb
|
34
35
|
- lib/marc/xml_parsers.rb
|
35
36
|
- lib/marc/xmlreader.rb
|