marc 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,3 +1,4 @@
1
+ [![Gem Version](https://badge.fury.io/rb/marc.png)](http://badge.fury.io/rb/marc)
1
2
  [![Build Status](https://secure.travis-ci.org/ruby-marc/ruby-marc.png)](http://travis-ci.org/ruby-marc/ruby-marc)
2
3
 
3
4
  marc is a ruby library for reading and writing MAchine Readable Cataloging
data/lib/marc/reader.rb CHANGED
@@ -396,7 +396,7 @@ module MARC
396
396
  # Like Reader ForgivingReader lets you read in a batch of MARC21 records
397
397
  # but it does not use record lengths and field byte offsets found in the
398
398
  # leader and directory. It is not unusual to run across MARC records
399
- # which have had their offsets calcualted wrong. In situations like this
399
+ # which have had their offsets calculated wrong. In situations like this
400
400
  # the vanilla Reader may fail, and you can try to use ForgivingReader.
401
401
  #
402
402
  # The one downside to this is that ForgivingReader will assume that the
data/lib/marc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module MARC
2
- VERSION = "0.6.0"
2
+ VERSION = "0.7.0"
3
3
  end
@@ -0,0 +1,108 @@
1
+ module MARC
2
+
3
+ # A class for writing MARC records as MARC21.
4
+
5
+ class Writer
6
+
7
+ # the constructor which you must pass a file path
8
+ # or an object that responds to a write message
9
+
10
+ def initialize(file)
11
+ if file.class == String
12
+ @fh = File.new(file,"w")
13
+ elsif file.respond_to?('write')
14
+ @fh = file
15
+ else
16
+ throw "must pass in file name or handle"
17
+ end
18
+ end
19
+
20
+
21
+ # write a record to the file or handle
22
+
23
+ def write(record)
24
+ @fh.write(MARC::Writer.encode(record))
25
+ end
26
+
27
+
28
+ # close underlying filehandle
29
+
30
+ def close
31
+ @fh.close
32
+ end
33
+
34
+
35
+ # a static method that accepts a MARC::Record object
36
+ # and returns the record encoded as MARC21 in transmission format
37
+
38
+ def self.encode(record)
39
+ directory = ''
40
+ fields = ''
41
+ offset = 0
42
+ for field in record.fields
43
+
44
+ # encode the field
45
+ field_data = ''
46
+ if field.class == MARC::DataField
47
+ warn("Warn: Missing indicator") unless field.indicator1 && field.indicator2
48
+ field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
49
+ for s in field.subfields
50
+ field_data += SUBFIELD_INDICATOR + s.code + s.value
51
+ end
52
+ elsif field.class == MARC::ControlField
53
+ field_data = field.value
54
+ end
55
+ field_data += END_OF_FIELD
56
+
57
+ # calculate directory entry for the field
58
+ field_length = (field_data.respond_to?(:bytesize) ?
59
+ field_data.bytesize() :
60
+ field_data.length())
61
+ directory += sprintf("%03s", field.tag) + format_byte_count(field_length, 4) + format_byte_count(offset)
62
+
63
+
64
+ # add field to data for other fields
65
+ fields += field_data
66
+
67
+ # update offset for next field
68
+ offset += field_length
69
+ end
70
+
71
+ # determine the base (leader + directory)
72
+ base = record.leader + directory + END_OF_FIELD
73
+
74
+ # determine complete record
75
+ marc = base + fields + END_OF_RECORD
76
+
77
+ # update leader with the byte offset to the end of the directory
78
+ marc[12..16] = format_byte_count(base.respond_to?(:bytesize) ?
79
+ base.bytesize() :
80
+ base.length()
81
+ )
82
+
83
+ # update the record length
84
+ marc[0..4] = format_byte_count(marc.respond_to?(:bytesize) ?
85
+ marc.bytesize() :
86
+ marc.length()
87
+ )
88
+
89
+ # store updated leader in the record that was passed in
90
+ record.leader = marc[0..LEADER_LENGTH-1]
91
+
92
+ # return encoded marc
93
+ return marc
94
+ end
95
+
96
+ def self.format_byte_count(number, num_digits=5)
97
+ formatted = sprintf("%0#{num_digits}i", number)
98
+ if formatted.length > num_digits
99
+ # uh, oh, we've exceeded our max. This version neither zeroes out
100
+ # nor raises: it fills the slot with 9s (see the assignment below).
101
+ #formatted = sprintf("%0#{num_digits}i", "")
102
+ formatted = "9" * num_digits
103
+ end
104
+ return formatted
105
+ end
106
+
107
+ end
108
+ end
data/lib/marc/writer.rb CHANGED
@@ -1,8 +1,30 @@
1
1
  module MARC
2
2
 
3
- # A class for writing MARC records as MARC21.
4
-
3
+ # A class for writing MARC records as binary MARC (ISO 2709)
4
+ #
5
+ # == Too-long records
6
+ #
7
+ # The MARC binary format only allows records that are at most 99999 bytes long in total,
8
+ # due to size of a length field in the record.
9
+ #
10
+ # By default, the Writer will raise a MARC::Exception when encountering
11
+ # in-memory records that are too long to be legally written out as ISO 2709
12
+ # binary.
13
+ #
14
+ # However, if you set `allow_oversized` to true, then the Writer will
15
+ # write these records out anyway, filling in any binary length/offset slots
16
+ # with all 0's, if they are not wide enough to hold the true value.
17
+ # While these records are illegal, they can still be read back in using
18
+ # the MARC::ForgivingReader, as well as other platform MARC readers
19
+ # in tolerant mode.
20
+ #
21
+ # If you set `allow_oversized` to false on the Writer, a MARC::Exception
22
+ # will be raised instead, if you try to write an oversized record.
23
+ #
24
+ # writer = Writer.new(some_path)
25
+ # writer.allow_oversized = true
5
26
  class Writer
27
+ attr_accessor :allow_oversized
6
28
 
7
29
  # the constructor which you must pass a file path
8
30
  # or an object that responds to a write message
@@ -15,13 +37,14 @@ module MARC
15
37
  else
16
38
  throw "must pass in file name or handle"
17
39
  end
40
+ self.allow_oversized = false
18
41
  end
19
42
 
20
43
 
21
44
  # write a record to the file or handle
22
45
 
23
46
  def write(record)
24
- @fh.write(MARC::Writer.encode(record))
47
+ @fh.write(MARC::Writer.encode(record, self.allow_oversized))
25
48
  end
26
49
 
27
50
 
@@ -34,8 +57,10 @@ module MARC
34
57
 
35
58
  # a static method that accepts a MARC::Record object
36
59
  # and returns the record encoded as MARC21 in transmission format
37
-
38
- def self.encode(record)
60
+ #
61
+ # Second arg allow_oversized, default false: when false, a MARC::Exception is raised
62
+ # for a MARC record that can't fit into ISO 2709; when true, oversized length/offset slots are zero-filled.
63
+ def self.encode(record, allow_oversized = false)
39
64
  directory = ''
40
65
  fields = ''
41
66
  offset = 0
@@ -58,8 +83,8 @@ module MARC
58
83
  field_length = (field_data.respond_to?(:bytesize) ?
59
84
  field_data.bytesize() :
60
85
  field_data.length())
61
- directory += sprintf("%03s%04i%05i", field.tag, field_length,
62
- offset)
86
+ directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
87
+
63
88
 
64
89
  # add field to data for other fields
65
90
  fields += field_data
@@ -75,22 +100,42 @@ module MARC
75
100
  marc = base + fields + END_OF_RECORD
76
101
 
77
102
  # update leader with the byte offset to the end of the directory
78
- marc[12..16] = sprintf("%05i", (base.respond_to?(:bytesize) ?
79
- base.bytesize() :
80
- base.length() )
81
- )
103
+ bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
104
+ marc[12..16] = format_byte_count(bytesize, allow_oversized)
105
+
82
106
 
83
107
  # update the record length
84
- marc[0..4] = sprintf("%05i", (marc.respond_to?(:bytesize) ?
85
- marc.bytesize() :
86
- marc.length())
87
- )
88
-
108
+ bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
109
+ marc[0..4] = format_byte_count(bytesize, allow_oversized)
110
+
89
111
  # store updated leader in the record that was passed in
90
112
  record.leader = marc[0..LEADER_LENGTH-1]
91
113
 
92
114
  # return encoded marc
93
- return marc
115
+ return marc
94
116
  end
117
+
118
+ # Formats numbers for insertion into marc binary slots.
119
+ # These slots only allow so many digits (and need to be left-padded
120
+ # with zeros to that number of digits). If the number
121
+ # is too big, either an exception will be raised, or
122
+ # we'll return all 0's to proper number of digits.
123
+ #
124
+ # first arg is number, second is boolean whether to allow oversized,
125
+ # third is max digits (default 5)
126
+ def self.format_byte_count(number, allow_oversized, num_digits=5)
127
+ formatted = sprintf("%0#{num_digits}i", number)
128
+ if formatted.length > num_digits
129
+ # uh, oh, we've exceeded our max. Either zero out
130
+ # or raise, depending on settings.
131
+ if allow_oversized
132
+ formatted = sprintf("%0#{num_digits}i", 0)
133
+ else
134
+ raise MARC::Exception.new("Can't write MARC record, as length/offset value of #{number} is too long for the #{num_digits} slot in binary format.")
135
+ end
136
+ end
137
+ return formatted
138
+ end
139
+
95
140
  end
96
141
  end
@@ -3,6 +3,8 @@
3
3
  require 'test/unit'
4
4
  require 'marc'
5
5
 
6
+ require 'stringio'
7
+
6
8
  # Testing char encodings under 1.9, don't bother running
7
9
  # these tests except under 1.9, will either fail (because
8
10
  # 1.9 func the test itself uses isn't there), or trivially pass
data/test/tc_writer.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require 'test/unit'
2
2
  require 'marc'
3
3
 
4
+ require 'stringio'
5
+
4
6
  class WriterTest < Test::Unit::TestCase
5
7
 
6
8
  def test_writer
@@ -19,6 +21,99 @@ class WriterTest < Test::Unit::TestCase
19
21
  # cleanup
20
22
  File.unlink('test/writer.dat')
21
23
  end
24
+
25
+ # Only in ruby 1.9
26
+ if "".respond_to?(:encoding)
27
+ def test_writer_bad_encoding
28
+ writer = MARC::Writer.new('test/writer.dat')
29
+
30
+
31
+ # MARC::Writer should just happily write out whatever bytes you give it, even
32
+ # mixing encodings that can't be mixed. We ran into an actual example mixing
33
+ # MARC8 (tagged ruby binary) and UTF8, we want it to be written out.
34
+
35
+ record = MARC::Record.new
36
+
37
+ record.append MARC::DataField.new('700', '0', ' ', ['a', "Nhouy Abhay,".force_encoding("BINARY")], ["c", "Th\xE5ao,".force_encoding("BINARY")], ["d", "1909-"])
38
+ record.append MARC::DataField.new('700', '0', ' ', ['a', "Somchin P\xF8\xE5o. Ngin,".force_encoding("BINARY")])
39
+
40
+ record.append MARC::DataField.new('100', '0', '0', ['a', "\xE5angkham. ".force_encoding("BINARY")])
41
+ record.append MARC::DataField.new('245', '1', '0', ['b', "chef-d'oeuvre de la litt\xE2erature lao".force_encoding("BINARY")])
42
+
43
+ # One in UTF8 and marked
44
+ record.append MARC::DataField.new('999', '0', '1', ['a', "chef-d'ocuvre de la littU+FFC3\U+FFA9rature".force_encoding("UTF-8")])
45
+
46
+ writer.write(record)
47
+ writer.close
48
+
49
+ ensure
50
+ File.unlink('test/writer.dat')
51
+ end
52
+ end
53
+
54
+ def test_write_too_long_iso2709
55
+ too_long_record = MARC::Record.new
56
+ 1.upto(1001) do
57
+ too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
58
+ end
59
+
60
+ wbuffer = StringIO.new("", "w")
61
+ writer = MARC::Writer.new(wbuffer)
62
+ writer.allow_oversized = true
63
+
64
+ writer.write(too_long_record)
65
+ writer.close
66
+
67
+ assert_equal "00000", wbuffer.string.slice(0, 5), "zero'd out length bytes when too long"
68
+
69
+ rbuffer = StringIO.new(wbuffer.string.dup)
70
+
71
+ # Regular reader won't read our illegal record.
72
+ #assert_raise(NoMethodError) do
73
+ # reader = MARC::Reader.new(rbuffer)
74
+ # reader.first
75
+ #end
76
+
77
+ # Forgiving reader will, round trippable
78
+ new_record = MARC::Reader.decode(rbuffer.string, :forgiving => true)
79
+ assert_equal too_long_record, new_record, "Too long record round-trippable with forgiving mode"
80
+
81
+ # Test in the middle of a MARC file
82
+ good_record = MARC::Record.new
83
+ good_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A short record'])
84
+ wbuffer = StringIO.new("", "w")
85
+ writer = MARC::Writer.new(wbuffer)
86
+ writer.allow_oversized = true
87
+
88
+ writer.write(good_record)
89
+ writer.write(too_long_record)
90
+ writer.write(good_record)
91
+
92
+ rbuffer = StringIO.new(wbuffer.string.dup)
93
+ reader = MARC::ForgivingReader.new(rbuffer)
94
+ records = reader.to_a
95
+
96
+ assert_equal 3, records.length
97
+ assert_equal good_record, records[0]
98
+ assert_equal good_record, records[2]
99
+ assert_equal too_long_record, records[1]
100
+ end
101
+
102
+ def test_raises_on_too_long_if_configured
103
+ too_long_record = MARC::Record.new
104
+ 1.upto(1001) do
105
+ too_long_record.append MARC::DataField.new("500", ' ', ' ', ['a', 'A really long record.1234567890123456789012345678901234567890123456789012345678901234567890123456789'])
106
+ end
107
+
108
+ wbuffer = StringIO.new("", "w")
109
+ writer = MARC::Writer.new(wbuffer)
110
+
111
+ assert_raise(MARC::Exception) do
112
+ writer.write too_long_record
113
+ end
114
+
115
+ end
116
+
22
117
 
23
118
  def test_forgiving_writer
24
119
  marc = "00305cam a2200133 a 4500001000700000003000900007005001700016008004100033008004100074035002500115245001700140909001000157909000400167\036635145\036UK-BiLMS\03620060329173705.0\036s1982iieng6 000 0 eng||\036060116|||||||||xxk eng||\036 \037a(UK-BiLMS)M0017366ZW\03600\037aTest record.\036 \037aa\037b\037c\036\037b0\036\035\000"
@@ -29,18 +124,18 @@ class WriterTest < Test::Unit::TestCase
29
124
  end
30
125
 
31
126
  def test_unicode_roundtrip
32
- record = MARC::Reader.new('test/utf8.marc').first
127
+ record = MARC::Reader.new('test/utf8.marc', :external_encoding => "UTF-8").first
33
128
 
34
129
  writer = MARC::Writer.new('test/writer.dat')
35
130
  writer.write(record)
36
131
  writer.close
37
132
 
38
- read_back_record = MARC::Reader.new('test/writer.dat').first
133
+ read_back_record = MARC::Reader.new('test/writer.dat', :external_encoding => "UTF-8").first
39
134
 
40
135
  # Make sure the one we wrote out then read in again
41
136
  # is the same as the one we read the first time
42
137
  # Looks like "==" is over-ridden to do that. Don't ever change, #==
43
- assert (record == read_back_record), "Round-tripped record must equal original record"
138
+ assert_equal record, read_back_record, "Round-tripped record must equal original record"
44
139
  end
45
140
 
46
141
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: marc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ authors:
13
13
  autorequire: marc
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2013-08-19 00:00:00.000000000 Z
16
+ date: 2013-09-03 00:00:00.000000000 Z
17
17
  dependencies: []
18
18
  description:
19
19
  email: ehs@pobox.com
@@ -30,6 +30,7 @@ files:
30
30
  - lib/marc/record.rb
31
31
  - lib/marc/subfield.rb
32
32
  - lib/marc/version.rb
33
+ - lib/marc/writer-NEW.rb
33
34
  - lib/marc/writer.rb
34
35
  - lib/marc/xml_parsers.rb
35
36
  - lib/marc/xmlreader.rb