marc 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/marc/reader.rb +4 -1
- data/lib/marc/record.rb +1 -1
- data/lib/marc/xmlwriter.rb +96 -17
- data/test/{tc_xmlreader.rb → tc_xml.rb} +7 -4
- data/test/ts_marc.rb +1 -2
- metadata +33 -40
- data/test/foo.xml +0 -1
- data/test/tc_xmlwriter.rb +0 -37
data/lib/marc/reader.rb
CHANGED
@@ -12,7 +12,10 @@ module MARC
|
|
12
12
|
# fh = File.new('marc.dat')
|
13
13
|
# reader = MARC::Reader.new(fh)
|
14
14
|
#
|
15
|
-
# or really any object that responds to read(n)
|
15
|
+
# or really any object that responds to read(n)
|
16
|
+
#
|
17
|
+
# # marc is a string with a bunch of records in it
|
18
|
+
# reader = MARC::Reader.new(StringIO.new(marc))
|
16
19
|
|
17
20
|
def initialize(file)
|
18
21
|
if file.class == String:
|
data/lib/marc/record.rb
CHANGED
data/lib/marc/xmlwriter.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rexml/document'
|
2
|
+
require 'rexml/text'
|
2
3
|
|
3
4
|
module MARC
|
4
5
|
|
@@ -8,8 +9,13 @@ module MARC
|
|
8
9
|
|
9
10
|
# the constructor which you must pass a file path
|
10
11
|
# or an object that responds to a write message
|
12
|
+
# the second argument is a hash of options, currently
|
13
|
+
# only supporting one option, stylesheet
|
14
|
+
#
|
15
|
+
# writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
|
16
|
+
# writer.write record
|
11
17
|
|
12
|
-
def initialize(file)
|
18
|
+
def initialize(file, opts={})
|
13
19
|
if file.class == String
|
14
20
|
@fh = File.new(file,"w")
|
15
21
|
elsif file.respond_to?('write')
|
@@ -19,17 +25,22 @@ module MARC
|
|
19
25
|
end
|
20
26
|
|
21
27
|
@fh.write("<?xml version='1.0'?>")
|
22
|
-
|
28
|
+
if opts[:stylesheet]
|
29
|
+
@fh.write(
|
30
|
+
%Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>})
|
31
|
+
end
|
23
32
|
@fh.write("<collection xmlns='" + MARC_NS + "' " +
|
24
33
|
"xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
|
25
34
|
"xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
|
35
|
+
@fh.write("\n")
|
26
36
|
end
|
27
37
|
|
28
38
|
|
29
39
|
# write a record to the file or handle
|
30
40
|
|
31
41
|
def write(record)
|
32
|
-
|
42
|
+
MARC::XMLWriter.encode(record).write(@fh, 0)
|
43
|
+
@fh.write("\n");
|
33
44
|
end
|
34
45
|
|
35
46
|
|
@@ -40,24 +51,77 @@ module MARC
|
|
40
51
|
@fh.close
|
41
52
|
end
|
42
53
|
|
54
|
+
|
55
|
+
# Converts from ISO 8859-1 to UTF-8, normalizes the UTF-8, and puts a
|
56
|
+
# 'clean up marker' in records that have control characters (which are
|
57
|
+
# not valid in XML). This is useful for locating these records once
|
58
|
+
# they are in XML so problems caused by removing the invalid characters
|
59
|
+
# can be fixed by a person. This (or something in the module) needs to
|
60
|
+
# convert from MARC-8 to UTF-8, but it doesn't do this yet...
|
61
|
+
|
62
|
+
def self.convert_to_utf8(text)
|
63
|
+
cleaned_text = text.gsub(/[\x00-\x1f\x7f-\xff]+/, ' CLEAN_ME_UP ')
|
64
|
+
utf8_text = cleaned_text.unpack('C*').pack('U*')
|
65
|
+
normalized_text = REXML::Text::normalize(utf8_text)
|
66
|
+
|
67
|
+
return normalized_text
|
68
|
+
end
|
43
69
|
|
44
70
|
# a static method that accepts a MARC::Record object
|
45
|
-
# and returns a REXML::Document for the XML serialization
|
71
|
+
# and returns a REXML::Document for the XML serialization.
|
46
72
|
|
47
73
|
def self.encode(record)
|
74
|
+
singleChar = Regexp.new(/[\da-z ]{1}/)
|
75
|
+
ctrlFieldTag = Regexp.new(/00[1-9A-Za-z]{1}s/)
|
76
|
+
|
77
|
+
# Right now, this writer handles input from the strict and
|
78
|
+
# lenient MARC readers. Because it can get 'loose' MARC in, it
|
79
|
+
# attempts to do some cleanup on data values that are not valid
|
80
|
+
# MARCXML.
|
81
|
+
|
82
|
+
# TODO? Perhaps the 'loose MARC' checks should be split out
|
83
|
+
# into a tolerant MARCXMLWriter allowing the main one to skip
|
84
|
+
# this extra work.
|
85
|
+
|
86
|
+
# TODO: At the very least there should be some logging
|
87
|
+
# to record our attempts to account for less than perfect MARC.
|
88
|
+
|
48
89
|
root = "<record/>"
|
49
|
-
doc = REXML::Document.new
|
90
|
+
doc = REXML::Document.new(root)
|
91
|
+
|
92
|
+
# MARCXML only allows alphanumerics or spaces in the leader
|
93
|
+
record.leader.gsub!(/[^\w|^\s]/, 'Z')
|
94
|
+
|
95
|
+
# MARCXML is particular about last four characters; ILSes aren't
|
96
|
+
if (record.leader[20..23] != "4500")
|
97
|
+
record.leader[20..23] = "4500"
|
98
|
+
end
|
50
99
|
|
51
|
-
# MARCXML
|
52
|
-
record.leader[
|
100
|
+
# MARCXML doesn't like a space here so we need a filler character: Z
|
101
|
+
if (record.leader[6..6] == " ")
|
102
|
+
record.leader[6..6] = "Z"
|
103
|
+
end
|
53
104
|
|
54
|
-
leader = REXML::Element.new
|
55
|
-
leader.add_text
|
56
|
-
doc.root.add_element
|
105
|
+
leader = REXML::Element.new("leader")
|
106
|
+
leader.add_text(record.leader)
|
107
|
+
doc.root.add_element(leader)
|
57
108
|
|
58
109
|
for field in record.fields
|
59
110
|
if field.class == MARC::DataField
|
60
|
-
datafield_elem = REXML::Element.new
|
111
|
+
datafield_elem = REXML::Element.new("datafield")
|
112
|
+
|
113
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
114
|
+
# the 'z' ind1 value should help us locate these later to fix
|
115
|
+
if (field.indicator1.match(singleChar) == nil)
|
116
|
+
field.indicator1 = 'z'
|
117
|
+
end
|
118
|
+
|
119
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
120
|
+
# the 'z' ind2 value should help us locate these later to fix
|
121
|
+
if (field.indicator2.match(singleChar) == nil)
|
122
|
+
field.indicator2 = 'z'
|
123
|
+
end
|
124
|
+
|
61
125
|
datafield_elem.add_attributes({
|
62
126
|
"tag"=>field.tag,
|
63
127
|
"ind1"=>field.indicator1,
|
@@ -65,18 +129,33 @@ module MARC
|
|
65
129
|
})
|
66
130
|
|
67
131
|
for subfield in field.subfields
|
68
|
-
subfield_element = REXML::Element.new
|
132
|
+
subfield_element = REXML::Element.new("subfield")
|
133
|
+
|
134
|
+
# If marc is leniently parsed, we may have some dirty data; using
|
135
|
+
# the blank subfield code should help us locate these later to fix
|
136
|
+
if (subfield.code.match(singleChar) == nil)
|
137
|
+
subfield.code = ' '
|
138
|
+
end
|
139
|
+
|
69
140
|
subfield_element.add_attribute("code", subfield.code)
|
70
|
-
|
71
|
-
|
141
|
+
text = MARC::XMLWriter.convert_to_utf8(subfield.value)
|
142
|
+
subfield_element.add_text(text)
|
143
|
+
datafield_elem.add_element(subfield_element)
|
72
144
|
end
|
73
145
|
|
74
146
|
doc.root.add_element datafield_elem
|
75
147
|
elsif field.class == MARC::ControlField
|
76
|
-
control_element = REXML::Element.new
|
148
|
+
control_element = REXML::Element.new("controlfield")
|
149
|
+
|
150
|
+
# We need a marker for invalid tag values (we use 00z)
|
151
|
+
if (field.tag.match(ctrlFieldTag) == nil)
|
152
|
+
field.tag = "00z"
|
153
|
+
end
|
154
|
+
|
77
155
|
control_element.add_attribute("tag", field.tag)
|
78
|
-
|
79
|
-
|
156
|
+
text = MARC::XMLWriter.convert_to_utf8(field.value)
|
157
|
+
control_element.add_text(text)
|
158
|
+
doc.root.add_element(control_element)
|
80
159
|
end
|
81
160
|
end
|
82
161
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'test/unit'
|
2
2
|
require 'marc'
|
3
3
|
|
4
|
-
class
|
4
|
+
class XMLTest < Test::Unit::TestCase
|
5
5
|
|
6
6
|
def otest_batch
|
7
7
|
reader = MARC::XMLReader.new('test/batch.xml')
|
@@ -20,15 +20,18 @@ class XMLReaderTest < Test::Unit::TestCase
|
|
20
20
|
record1.append MARC::DataField.new('245', '0', '4',
|
21
21
|
['a', 'The Great Ray Charles'], ['h', '[sound recording].'])
|
22
22
|
|
23
|
-
writer = MARC::XMLWriter.new('test/
|
23
|
+
writer = MARC::XMLWriter.new('test/test.xml', :stylesheet => 'style.xsl')
|
24
24
|
writer.write(record1)
|
25
25
|
writer.close
|
26
26
|
|
27
|
-
|
27
|
+
xml = File.read('test/test.xml')
|
28
|
+
assert_match /<\?xml-stylesheet type="text\/xsl" href="style.xsl"\?>/, xml
|
29
|
+
|
30
|
+
reader = MARC::XMLReader.new('test/test.xml')
|
28
31
|
record2 = reader.entries[0]
|
29
32
|
assert_equal(record1, record2)
|
30
33
|
|
31
|
-
|
34
|
+
File.unlink('test/test.xml')
|
32
35
|
end
|
33
36
|
end
|
34
37
|
|
data/test/ts_marc.rb
CHANGED
metadata
CHANGED
@@ -3,11 +3,11 @@ rubygems_version: 0.8.11
|
|
3
3
|
specification_version: 1
|
4
4
|
name: marc
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.0.
|
7
|
-
date: 2006-
|
6
|
+
version: 0.0.9
|
7
|
+
date: 2006-03-28 00:00:00 -06:00
|
8
8
|
summary: A ruby library for working with Machine Readable Cataloging
|
9
9
|
require_paths:
|
10
|
-
- lib
|
10
|
+
- lib
|
11
11
|
email: ehs@pobox.com
|
12
12
|
homepage: http://www.textualize.com/ruby_marc
|
13
13
|
rubyforge_project:
|
@@ -18,52 +18,45 @@ bindir: bin
|
|
18
18
|
has_rdoc: true
|
19
19
|
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
20
|
requirements:
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
-
|
22
|
+
- ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
24
25
|
version:
|
25
26
|
platform: ruby
|
26
27
|
signing_key:
|
27
28
|
cert_chain:
|
28
29
|
authors:
|
29
|
-
- Ed Summers
|
30
|
+
- Ed Summers
|
30
31
|
files:
|
31
|
-
- lib/marc
|
32
|
-
- lib/marc.rb
|
33
|
-
- lib/marc/
|
34
|
-
- lib/marc/
|
35
|
-
- lib/marc/datafield.rb
|
36
|
-
- lib/marc/
|
37
|
-
- lib/marc/
|
38
|
-
- lib/marc/
|
39
|
-
- lib/marc/
|
40
|
-
- lib/marc/
|
41
|
-
- lib/marc/
|
42
|
-
- lib/marc/
|
43
|
-
- test/batch.dat
|
44
|
-
- test/batch.xml
|
45
|
-
- test/
|
46
|
-
- test/
|
47
|
-
- test/
|
48
|
-
- test/
|
49
|
-
- test/
|
50
|
-
- test/tc_subfield.rb
|
51
|
-
- test/
|
52
|
-
- test/
|
53
|
-
- test/
|
54
|
-
- test/tc_record.rb
|
55
|
-
- test/ts_marc.rb
|
32
|
+
- lib/marc
|
33
|
+
- lib/marc.rb
|
34
|
+
- lib/marc/constants.rb
|
35
|
+
- lib/marc/controlfield.rb
|
36
|
+
- lib/marc/datafield.rb
|
37
|
+
- lib/marc/exception.rb
|
38
|
+
- lib/marc/reader.rb
|
39
|
+
- lib/marc/record.rb
|
40
|
+
- lib/marc/subfield.rb
|
41
|
+
- lib/marc/writer.rb
|
42
|
+
- lib/marc/xmlreader.rb
|
43
|
+
- lib/marc/xmlwriter.rb
|
44
|
+
- test/batch.dat
|
45
|
+
- test/batch.xml
|
46
|
+
- test/one.dat
|
47
|
+
- test/tc_controlfield.rb
|
48
|
+
- test/tc_datafield.rb
|
49
|
+
- test/tc_reader.rb
|
50
|
+
- test/tc_record.rb
|
51
|
+
- test/tc_subfield.rb
|
52
|
+
- test/tc_writer.rb
|
53
|
+
- test/tc_xml.rb
|
54
|
+
- test/ts_marc.rb
|
56
55
|
test_files:
|
57
|
-
- test/ts_marc.rb
|
56
|
+
- test/ts_marc.rb
|
58
57
|
rdoc_options: []
|
59
|
-
|
60
58
|
extra_rdoc_files: []
|
61
|
-
|
62
59
|
executables: []
|
63
|
-
|
64
60
|
extensions: []
|
65
|
-
|
66
61
|
requirements: []
|
67
|
-
|
68
|
-
dependencies: []
|
69
|
-
|
62
|
+
dependencies: []
|
data/test/foo.xml
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
<?xml version='1.0'?><collection xmlns='http://www.loc.gov/MARC21/slim' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'><record><leader>00925njm 22002777a 4500</leader><controlfield tag='007'>sdubumennmplu</controlfield><datafield tag='245' ind1='0' ind2='4'><subfield code='a'>The Great Ray Charles</subfield><subfield code='h'>[sound recording].</subfield></datafield></record></collection>
|
data/test/tc_xmlwriter.rb
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
require 'marc'
|
3
|
-
|
4
|
-
class XMLWriterTest < Test::Unit::TestCase
|
5
|
-
|
6
|
-
def test_writer()
|
7
|
-
# get a record
|
8
|
-
reader = MARC::Reader.new('test/one.dat')
|
9
|
-
record = reader.entries[0]
|
10
|
-
|
11
|
-
str_writer = StringWriter.new()
|
12
|
-
xml_writer = MARC::XMLWriter.new(str_writer)
|
13
|
-
xml_writer.write(record)
|
14
|
-
assert_match /<\?xml version='1.0'\?>/, str_writer.buffer
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
# little class that enables writing to a string
|
19
|
-
# like it's a file
|
20
|
-
|
21
|
-
class StringWriter
|
22
|
-
attr_reader :buffer
|
23
|
-
|
24
|
-
def initialize
|
25
|
-
@buffer = ''
|
26
|
-
end
|
27
|
-
|
28
|
-
def write(str)
|
29
|
-
@buffer += str
|
30
|
-
end
|
31
|
-
|
32
|
-
def to_s
|
33
|
-
return @buffer
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
|