marc 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/marc/reader.rb +4 -1
- data/lib/marc/record.rb +1 -1
- data/lib/marc/xmlwriter.rb +96 -17
- data/test/{tc_xmlreader.rb → tc_xml.rb} +7 -4
- data/test/ts_marc.rb +1 -2
- metadata +33 -40
- data/test/foo.xml +0 -1
- data/test/tc_xmlwriter.rb +0 -37
data/lib/marc/reader.rb
CHANGED
@@ -12,7 +12,10 @@ module MARC
|
|
12
12
|
# fh = File.new('marc.dat')
|
13
13
|
# reader = MARC::Reader.new(fh)
|
14
14
|
#
|
15
|
-
# or really any object that responds to read(n)
|
15
|
+
# or really any object that responds to read(n)
|
16
|
+
#
|
17
|
+
# # marc is a string with a bunch of records in it
|
18
|
+
# reader = MARC::Reader.new(StringIO.new(reader))
|
16
19
|
|
17
20
|
def initialize(file)
|
18
21
|
if file.class == String:
|
data/lib/marc/record.rb
CHANGED
data/lib/marc/xmlwriter.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rexml/document'
|
2
|
+
require 'rexml/text'
|
2
3
|
|
3
4
|
module MARC
|
4
5
|
|
@@ -8,8 +9,13 @@ module MARC
|
|
8
9
|
|
9
10
|
# the constructor which you must pass a file path
|
10
11
|
# or an object that responds to a write message
|
12
|
+
# the second argument is a hash of options, currently
|
13
|
+
# only supporting one option, stylesheet
|
14
|
+
#
|
15
|
+
# writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
|
16
|
+
# writer.write record
|
11
17
|
|
12
|
-
def initialize(file)
|
18
|
+
def initialize(file, opts={})
|
13
19
|
if file.class == String
|
14
20
|
@fh = File.new(file,"w")
|
15
21
|
elsif file.respond_to?('write')
|
@@ -19,17 +25,22 @@ module MARC
|
|
19
25
|
end
|
20
26
|
|
21
27
|
@fh.write("<?xml version='1.0'?>")
|
22
|
-
|
28
|
+
if opts[:stylesheet]
|
29
|
+
@fh.write(
|
30
|
+
%Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>})
|
31
|
+
end
|
23
32
|
@fh.write("<collection xmlns='" + MARC_NS + "' " +
|
24
33
|
"xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
|
25
34
|
"xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
|
35
|
+
@fh.write("\n")
|
26
36
|
end
|
27
37
|
|
28
38
|
|
29
39
|
# write a record to the file or handle
|
30
40
|
|
31
41
|
def write(record)
|
32
|
-
|
42
|
+
MARC::XMLWriter.encode(record).write(@fh, 0)
|
43
|
+
@fh.write("\n");
|
33
44
|
end
|
34
45
|
|
35
46
|
|
@@ -40,24 +51,77 @@ module MARC
|
|
40
51
|
@fh.close
|
41
52
|
end
|
42
53
|
|
54
|
+
|
55
|
+
# Converts from ISO 8859-1 to UTF-8, normalizes the UTF-8, and puts a
|
56
|
+
# 'clean up marker' in records that have control characters (which are
|
57
|
+
# not valid in XML). This is useful for locating these records once
|
58
|
+
# they are in XML so problems caused by removing the invalid characters
|
59
|
+
# can be fixed by a person. This (or something in the module) needs to
|
60
|
+
# convert from MARC-8 to UTF-8, but it doesn't do this yet...
|
61
|
+
|
62
|
+
def self.convert_to_utf8(text)
|
63
|
+
cleaned_text = text.gsub(/[\x00-\x1f\x7f-\xff]+/, ' CLEAN_ME_UP ')
|
64
|
+
utf8_text = cleaned_text.unpack('C*').pack('U*')
|
65
|
+
normalized_text = REXML::Text::normalize(utf8_text)
|
66
|
+
|
67
|
+
return normalized_text
|
68
|
+
end
|
43
69
|
|
44
70
|
# a static method that accepts a MARC::Record object
|
45
|
-
# and returns a REXML::Document for the XML serialization
|
71
|
+
# and returns a REXML::Document for the XML serialization.
|
46
72
|
|
47
73
|
def self.encode(record)
|
74
|
+
singleChar = Regexp.new(/[\da-z ]{1}/)
|
75
|
+
ctrlFieldTag = Regexp.new(/00[1-9A-Za-z]{1}s/)
|
76
|
+
|
77
|
+
# Right now, this writer handles input from the strict and
|
78
|
+
# lenient MARC readers. Because it can get 'loose' MARC in, it
|
79
|
+
# attempts to do some cleanup on data values that are not valid
|
80
|
+
# MARCXML.
|
81
|
+
|
82
|
+
# TODO? Perhaps the 'loose MARC' checks should be split out
|
83
|
+
# into a tolerant MARCXMLWriter allowing the main one to skip
|
84
|
+
# this extra work.
|
85
|
+
|
86
|
+
# TODO: At the very least there should be some logging
|
87
|
+
# to record our attempts to account for less than perfect MARC.
|
88
|
+
|
48
89
|
root = "<record/>"
|
49
|
-
doc = REXML::Document.new
|
90
|
+
doc = REXML::Document.new(root)
|
91
|
+
|
92
|
+
# MARCXML only allows alphanumerics or spaces in the leader
|
93
|
+
record.leader.gsub!(/[^\w|^\s]/, 'Z')
|
94
|
+
|
95
|
+
# MARCXML is particular about last four characters; ILSes aren't
|
96
|
+
if (record.leader[20..23] != "4500")
|
97
|
+
record.leader[20..23] = "4500"
|
98
|
+
end
|
50
99
|
|
51
|
-
# MARCXML
|
52
|
-
record.leader[
|
100
|
+
# MARCXML doesn't like a space here so we need a filler character: Z
|
101
|
+
if (record.leader[6..6] == " ")
|
102
|
+
record.leader[6..6] = "Z"
|
103
|
+
end
|
53
104
|
|
54
|
-
leader = REXML::Element.new
|
55
|
-
leader.add_text
|
56
|
-
doc.root.add_element
|
105
|
+
leader = REXML::Element.new("leader")
|
106
|
+
leader.add_text(record.leader)
|
107
|
+
doc.root.add_element(leader)
|
57
108
|
|
58
109
|
for field in record.fields
|
59
110
|
if field.class == MARC::DataField
|
60
|
-
datafield_elem = REXML::Element.new
|
111
|
+
datafield_elem = REXML::Element.new("datafield")
|
112
|
+
|
113
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
114
|
+
# the 'z' ind1 value should help us locate these later to fix
|
115
|
+
if (field.indicator1.match(singleChar) == nil)
|
116
|
+
field.indicator1 = 'z'
|
117
|
+
end
|
118
|
+
|
119
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
120
|
+
# the 'z' ind2 value should help us locate these later to fix
|
121
|
+
if (field.indicator2.match(singleChar) == nil)
|
122
|
+
field.indicator2 = 'z'
|
123
|
+
end
|
124
|
+
|
61
125
|
datafield_elem.add_attributes({
|
62
126
|
"tag"=>field.tag,
|
63
127
|
"ind1"=>field.indicator1,
|
@@ -65,18 +129,33 @@ module MARC
|
|
65
129
|
})
|
66
130
|
|
67
131
|
for subfield in field.subfields
|
68
|
-
subfield_element = REXML::Element.new
|
132
|
+
subfield_element = REXML::Element.new("subfield")
|
133
|
+
|
134
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
135
|
+
# the blank subfield code should help us locate these later to fix
|
136
|
+
if (subfield.code.match(singleChar) == nil)
|
137
|
+
subfield.code = ' '
|
138
|
+
end
|
139
|
+
|
69
140
|
subfield_element.add_attribute("code", subfield.code)
|
70
|
-
|
71
|
-
|
141
|
+
text = MARC::XMLWriter.convert_to_utf8(subfield.value)
|
142
|
+
subfield_element.add_text(text)
|
143
|
+
datafield_elem.add_element(subfield_element)
|
72
144
|
end
|
73
145
|
|
74
146
|
doc.root.add_element datafield_elem
|
75
147
|
elsif field.class == MARC::ControlField
|
76
|
-
control_element = REXML::Element.new
|
148
|
+
control_element = REXML::Element.new("controlfield")
|
149
|
+
|
150
|
+
# We need a marker for invalid tag values (we use 000)
|
151
|
+
if (field.tag.match(ctrlFieldTag) == nil)
|
152
|
+
field.tag = "00z"
|
153
|
+
end
|
154
|
+
|
77
155
|
control_element.add_attribute("tag", field.tag)
|
78
|
-
|
79
|
-
|
156
|
+
text = MARC::XMLWriter.convert_to_utf8(field.value)
|
157
|
+
control_element.add_text(text)
|
158
|
+
doc.root.add_element(control_element)
|
80
159
|
end
|
81
160
|
end
|
82
161
|
|
data/test/{tc_xmlreader.rb → tc_xml.rb}
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'marc'
|
3
3
|
|
4
|
-
class XMLReaderTest < Test::Unit::TestCase
|
4
|
+
class XMLTest < Test::Unit::TestCase
|
5
5
|
|
6
6
|
def otest_batch
|
7
7
|
reader = MARC::XMLReader.new('test/batch.xml')
|
@@ -20,15 +20,18 @@ class XMLReaderTest < Test::Unit::TestCase
|
|
20
20
|
record1.append MARC::DataField.new('245', '0', '4',
|
21
21
|
['a', 'The Great Ray Charles'], ['h', '[sound recording].'])
|
22
22
|
|
23
|
-
writer = MARC::XMLWriter.new('test/foo.xml')
|
23
|
+
writer = MARC::XMLWriter.new('test/test.xml', :stylesheet => 'style.xsl')
|
24
24
|
writer.write(record1)
|
25
25
|
writer.close
|
26
26
|
|
27
|
-
|
27
|
+
xml = File.read('test/test.xml')
|
28
|
+
assert_match /<\?xml-stylesheet type="text\/xsl" href="style.xsl"\?>/, xml
|
29
|
+
|
30
|
+
reader = MARC::XMLReader.new('test/test.xml')
|
28
31
|
record2 = reader.entries[0]
|
29
32
|
assert_equal(record1, record2)
|
30
33
|
|
31
|
-
|
34
|
+
File.unlink('test/test.xml')
|
32
35
|
end
|
33
36
|
end
|
34
37
|
|
data/test/ts_marc.rb
CHANGED
metadata
CHANGED
@@ -3,11 +3,11 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: marc
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.0.8
|
7
|
-
date: 2006-
|
6
|
+
version: 0.0.9
|
7
|
+
date: 2006-03-28 00:00:00 -06:00
|
8
8
|
summary: A ruby library for working with Machine Readable Cataloging
|
9
9
|
require_paths:
|
10
|
-
- lib
|
10
|
+
- lib
|
11
11
|
email: ehs@pobox.com
|
12
12
|
homepage: http://www.textualize.com/ruby_marc
|
13
13
|
rubyforge_project:
|
@@ -18,52 +18,45 @@ bindir: bin
|
|
18
18
|
has_rdoc: true
|
19
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
20
|
requirements:
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
-
|
22
|
+
- ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
24
25
|
version:
|
25
26
|
platform: ruby
|
26
27
|
signing_key:
|
27
28
|
cert_chain:
|
28
29
|
authors:
|
29
|
-
- Ed Summers
|
30
|
+
- Ed Summers
|
30
31
|
files:
|
31
|
-
- lib/marc
|
32
|
-
- lib/marc.rb
|
33
|
-
- lib/marc/
|
34
|
-
- lib/marc/
|
35
|
-
- lib/marc/datafield.rb
|
36
|
-
- lib/marc/
|
37
|
-
- lib/marc/
|
38
|
-
- lib/marc/
|
39
|
-
- lib/marc/
|
40
|
-
- lib/marc/
|
41
|
-
- lib/marc/
|
42
|
-
- lib/marc/
|
43
|
-
- test/batch.dat
|
44
|
-
- test/batch.xml
|
45
|
-
- test/
|
46
|
-
- test/
|
47
|
-
- test/
|
48
|
-
- test/
|
49
|
-
- test/
|
50
|
-
- test/tc_subfield.rb
|
51
|
-
- test/
|
52
|
-
- test/
|
53
|
-
- test/
|
54
|
-
- test/tc_record.rb
|
55
|
-
- test/ts_marc.rb
|
32
|
+
- lib/marc
|
33
|
+
- lib/marc.rb
|
34
|
+
- lib/marc/constants.rb
|
35
|
+
- lib/marc/controlfield.rb
|
36
|
+
- lib/marc/datafield.rb
|
37
|
+
- lib/marc/exception.rb
|
38
|
+
- lib/marc/reader.rb
|
39
|
+
- lib/marc/record.rb
|
40
|
+
- lib/marc/subfield.rb
|
41
|
+
- lib/marc/writer.rb
|
42
|
+
- lib/marc/xmlreader.rb
|
43
|
+
- lib/marc/xmlwriter.rb
|
44
|
+
- test/batch.dat
|
45
|
+
- test/batch.xml
|
46
|
+
- test/one.dat
|
47
|
+
- test/tc_controlfield.rb
|
48
|
+
- test/tc_datafield.rb
|
49
|
+
- test/tc_reader.rb
|
50
|
+
- test/tc_record.rb
|
51
|
+
- test/tc_subfield.rb
|
52
|
+
- test/tc_writer.rb
|
53
|
+
- test/tc_xml.rb
|
54
|
+
- test/ts_marc.rb
|
56
55
|
test_files:
|
57
|
-
- test/ts_marc.rb
|
56
|
+
- test/ts_marc.rb
|
58
57
|
rdoc_options: []
|
59
|
-
|
60
58
|
extra_rdoc_files: []
|
61
|
-
|
62
59
|
executables: []
|
63
|
-
|
64
60
|
extensions: []
|
65
|
-
|
66
61
|
requirements: []
|
67
|
-
|
68
|
-
dependencies: []
|
69
|
-
|
62
|
+
dependencies: []
|
data/test/foo.xml
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<?xml version='1.0'?><collection xmlns='http://www.loc.gov/MARC21/slim' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'><record><leader>00925njm 22002777a 4500</leader><controlfield tag='007'>sdubumennmplu</controlfield><datafield tag='245' ind1='0' ind2='4'><subfield code='a'>The Great Ray Charles</subfield><subfield code='h'>[sound recording].</subfield></datafield></record></collection>
|
data/test/tc_xmlwriter.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
require 'marc'
|
3
|
-
|
4
|
-
class XMLWriterTest < Test::Unit::TestCase
|
5
|
-
|
6
|
-
def test_writer()
|
7
|
-
# get a record
|
8
|
-
reader = MARC::Reader.new('test/one.dat')
|
9
|
-
record = reader.entries[0]
|
10
|
-
|
11
|
-
str_writer = StringWriter.new()
|
12
|
-
xml_writer = MARC::XMLWriter.new(str_writer)
|
13
|
-
xml_writer.write(record)
|
14
|
-
assert_match /<\?xml version='1.0'\?>/, str_writer.buffer
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
# little class that enables wriing to a string
|
19
|
-
# like it's a file
|
20
|
-
|
21
|
-
class StringWriter
|
22
|
-
attr_reader :buffer
|
23
|
-
|
24
|
-
def initialize
|
25
|
-
@buffer = ''
|
26
|
-
end
|
27
|
-
|
28
|
-
def write(str)
|
29
|
-
@buffer += str
|
30
|
-
end
|
31
|
-
|
32
|
-
def to_s
|
33
|
-
return @buffer
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
|