marc 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
data/lib/marc/reader.rb CHANGED
@@ -12,7 +12,10 @@ module MARC
12
12
  # fh = File.new('marc.dat')
13
13
  # reader = MARC::Reader.new(fh)
14
14
  #
15
- # or really any object that responds to read(n).
15
+ # or really any object that responds to read(n)
16
+ #
17
+ # # marc is a string with a bunch of records in it
18
+ # reader = MARC::Reader.new(StringIO.new(marc))
16
19
 
17
20
  def initialize(file)
18
21
  if file.class == String:
data/lib/marc/record.rb CHANGED
@@ -39,7 +39,7 @@ module MARC
39
39
  # title = record.find {|f| f.tag == '245'}
40
40
  #
41
41
  # getting all subjects
42
- # subjects = record.find_all {|f| ('600'..'699' === f.tag)}
42
+ # subjects = record.find_all {|f| ('600'..'699') === f.tag}
43
43
 
44
44
  def each
45
45
  for field in @fields
@@ -1,4 +1,5 @@
1
1
  require 'rexml/document'
2
+ require 'rexml/text'
2
3
 
3
4
  module MARC
4
5
 
@@ -8,8 +9,13 @@ module MARC
8
9
 
9
10
  # the constructor which you must pass a file path
10
11
  # or an object that responds to a write message
12
+ # the second argument is a hash of options, currently
13
+ # only supporting one option, stylesheet
14
+ #
15
+ # writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
16
+ # writer.write record
11
17
 
12
- def initialize(file)
18
+ def initialize(file, opts={})
13
19
  if file.class == String
14
20
  @fh = File.new(file,"w")
15
21
  elsif file.respond_to?('write')
@@ -19,17 +25,22 @@ module MARC
19
25
  end
20
26
 
21
27
  @fh.write("<?xml version='1.0'?>")
22
-
28
+ if opts[:stylesheet]
29
+ @fh.write(
30
+ %Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>})
31
+ end
23
32
  @fh.write("<collection xmlns='" + MARC_NS + "' " +
24
33
  "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
25
34
  "xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
35
+ @fh.write("\n")
26
36
  end
27
37
 
28
38
 
29
39
  # write a record to the file or handle
30
40
 
31
41
  def write(record)
32
- @fh.write(MARC::XMLWriter.encode(record).to_s)
42
+ MARC::XMLWriter.encode(record).write(@fh, 0)
43
+ @fh.write("\n");
33
44
  end
34
45
 
35
46
 
@@ -40,24 +51,77 @@ module MARC
40
51
  @fh.close
41
52
  end
42
53
 
54
+
55
+ # Converts from ISO 8859-1 to UTF-8, normalizes the UTF-8, and puts a
56
+ # 'clean up marker' in records that have control characters (which are
57
+ # not valid in XML). This is useful for locating these records once
58
+ # they are in XML so problems caused by removing the invalid characters
59
+ # can be fixed by a person. This (or something in the module) needs to
60
+ # convert from MARC-8 to UTF-8, but it doesn't do this yet...
61
+
62
+ def self.convert_to_utf8(text)
63
+ cleaned_text = text.gsub(/[\x00-\x1f\x7f-\xff]+/, ' CLEAN_ME_UP ')
64
+ utf8_text = cleaned_text.unpack('C*').pack('U*')
65
+ normalized_text = REXML::Text::normalize(utf8_text)
66
+
67
+ return normalized_text
68
+ end
43
69
 
44
70
  # a static method that accepts a MARC::Record object
45
- # and returns a REXML::Document for the XML serialization
71
+ # and returns a REXML::Document for the XML serialization.
46
72
 
47
73
  def self.encode(record)
74
+ singleChar = Regexp.new(/[\da-z ]{1}/)
75
+ ctrlFieldTag = Regexp.new(/00[1-9A-Za-z]{1}s/)
76
+
77
+ # Right now, this writer handles input from the strict and
78
+ # lenient MARC readers. Because it can get 'loose' MARC in, it
79
+ # attempts to do some cleanup on data values that are not valid
80
+ # MARCXML.
81
+
82
+ # TODO? Perhaps the 'loose MARC' checks should be split out
83
+ # into a tolerant MARCXMLWriter allowing the main one to skip
84
+ # this extra work.
85
+
86
+ # TODO: At the very least there should be some logging
87
+ # to record our attempts to account for less than perfect MARC.
88
+
48
89
  root = "<record/>"
49
- doc = REXML::Document.new root
90
+ doc = REXML::Document.new(root)
91
+
92
+ # MARCXML only allows alphanumerics or spaces in the leader
93
+ record.leader.gsub!(/[^\w|^\s]/, 'Z')
94
+
95
+ # MARCXML is particular about last four characters; ILSes aren't
96
+ if (record.leader[20..23] != "4500")
97
+ record.leader[20..23] = "4500"
98
+ end
50
99
 
51
- # MARCXML is particular about this; ILSes aren't
52
- record.leader[20..24] = "4500"
100
+ # MARCXML doesn't like a space here so we need a filler character: Z
101
+ if (record.leader[6..6] == " ")
102
+ record.leader[6..6] = "Z"
103
+ end
53
104
 
54
- leader = REXML::Element.new "leader"
55
- leader.add_text record.leader
56
- doc.root.add_element leader
105
+ leader = REXML::Element.new("leader")
106
+ leader.add_text(record.leader)
107
+ doc.root.add_element(leader)
57
108
 
58
109
  for field in record.fields
59
110
  if field.class == MARC::DataField
60
- datafield_elem = REXML::Element.new "datafield"
111
+ datafield_elem = REXML::Element.new("datafield")
112
+
113
+ # If marc is leniently parsed, we may have some dirty data; using
114
+ # the 'z' ind1 value should help us locate these later to fix
115
+ if (field.indicator1.match(singleChar) == nil)
116
+ field.indicator1 = 'z'
117
+ end
118
+
119
+ # If marc is leniently parsed, we may have some dirty data; using
120
+ # the 'z' ind2 value should help us locate these later to fix
121
+ if (field.indicator2.match(singleChar) == nil)
122
+ field.indicator2 = 'z'
123
+ end
124
+
61
125
  datafield_elem.add_attributes({
62
126
  "tag"=>field.tag,
63
127
  "ind1"=>field.indicator1,
@@ -65,18 +129,33 @@ module MARC
65
129
  })
66
130
 
67
131
  for subfield in field.subfields
68
- subfield_element = REXML::Element.new "subfield"
132
+ subfield_element = REXML::Element.new("subfield")
133
+
134
+ # If marc is leniently parsed, we may have some dirty data; using
135
+ # the blank subfield code should help us locate these later to fix
136
+ if (subfield.code.match(singleChar) == nil)
137
+ subfield.code = ' '
138
+ end
139
+
69
140
  subfield_element.add_attribute("code", subfield.code)
70
- subfield_element.add_text subfield.value
71
- datafield_elem.add_element subfield_element
141
+ text = MARC::XMLWriter.convert_to_utf8(subfield.value)
142
+ subfield_element.add_text(text)
143
+ datafield_elem.add_element(subfield_element)
72
144
  end
73
145
 
74
146
  doc.root.add_element datafield_elem
75
147
  elsif field.class == MARC::ControlField
76
- control_element = REXML::Element.new "controlfield"
148
+ control_element = REXML::Element.new("controlfield")
149
+
150
+ # We need a marker for invalid tag values (we use 00z)
151
+ if (field.tag.match(ctrlFieldTag) == nil)
152
+ field.tag = "00z"
153
+ end
154
+
77
155
  control_element.add_attribute("tag", field.tag)
78
- control_element.add_text field.value
79
- doc.root.add_element control_element
156
+ text = MARC::XMLWriter.convert_to_utf8(field.value)
157
+ control_element.add_text(text)
158
+ doc.root.add_element(control_element)
80
159
  end
81
160
  end
82
161
 
@@ -1,7 +1,7 @@
1
1
  require 'test/unit'
2
2
  require 'marc'
3
3
 
4
- class XMLReaderTest < Test::Unit::TestCase
4
+ class XMLTest < Test::Unit::TestCase
5
5
 
6
6
  def otest_batch
7
7
  reader = MARC::XMLReader.new('test/batch.xml')
@@ -20,15 +20,18 @@ class XMLReaderTest < Test::Unit::TestCase
20
20
  record1.append MARC::DataField.new('245', '0', '4',
21
21
  ['a', 'The Great Ray Charles'], ['h', '[sound recording].'])
22
22
 
23
- writer = MARC::XMLWriter.new('test/foo.xml')
23
+ writer = MARC::XMLWriter.new('test/test.xml', :stylesheet => 'style.xsl')
24
24
  writer.write(record1)
25
25
  writer.close
26
26
 
27
- reader = MARC::XMLReader.new('test/foo.xml')
27
+ xml = File.read('test/test.xml')
28
+ assert_match /<\?xml-stylesheet type="text\/xsl" href="style.xsl"\?>/, xml
29
+
30
+ reader = MARC::XMLReader.new('test/test.xml')
28
31
  record2 = reader.entries[0]
29
32
  assert_equal(record1, record2)
30
33
 
31
- #File.unlink('test/foo.xml')
34
+ File.unlink('test/test.xml')
32
35
  end
33
36
  end
34
37
 
data/test/ts_marc.rb CHANGED
@@ -11,5 +11,4 @@ require 'test/tc_controlfield'
11
11
  require 'test/tc_record'
12
12
  require 'test/tc_reader'
13
13
  require 'test/tc_writer'
14
- require 'test/tc_xmlwriter'
15
- require 'test/tc_xmlreader'
14
+ require 'test/tc_xml'
metadata CHANGED
@@ -3,11 +3,11 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: marc
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.0.8
7
- date: 2006-01-16 00:00:00 -05:00
6
+ version: 0.0.9
7
+ date: 2006-03-28 00:00:00 -06:00
8
8
  summary: A ruby library for working with Machine Readable Cataloging
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: ehs@pobox.com
12
12
  homepage: http://www.textualize.com/ruby_marc
13
13
  rubyforge_project:
@@ -18,52 +18,45 @@ bindir: bin
18
18
  has_rdoc: true
19
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
20
20
  requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
24
25
  version:
25
26
  platform: ruby
26
27
  signing_key:
27
28
  cert_chain:
28
29
  authors:
29
- - Ed Summers
30
+ - Ed Summers
30
31
  files:
31
- - lib/marc
32
- - lib/marc.rb
33
- - lib/marc/writer.rb
34
- - lib/marc/subfield.rb
35
- - lib/marc/datafield.rb
36
- - lib/marc/record.rb
37
- - lib/marc/constants.rb
38
- - lib/marc/xmlreader.rb
39
- - lib/marc/xmlwriter.rb
40
- - lib/marc/exception.rb
41
- - lib/marc/controlfield.rb
42
- - lib/marc/reader.rb
43
- - test/batch.dat
44
- - test/batch.xml
45
- - test/tc_datafield.rb
46
- - test/tc_reader.rb
47
- - test/foo.xml
48
- - test/tc_writer.rb
49
- - test/tc_controlfield.rb
50
- - test/tc_subfield.rb
51
- - test/one.dat
52
- - test/tc_xmlreader.rb
53
- - test/tc_xmlwriter.rb
54
- - test/tc_record.rb
55
- - test/ts_marc.rb
32
+ - lib/marc
33
+ - lib/marc.rb
34
+ - lib/marc/constants.rb
35
+ - lib/marc/controlfield.rb
36
+ - lib/marc/datafield.rb
37
+ - lib/marc/exception.rb
38
+ - lib/marc/reader.rb
39
+ - lib/marc/record.rb
40
+ - lib/marc/subfield.rb
41
+ - lib/marc/writer.rb
42
+ - lib/marc/xmlreader.rb
43
+ - lib/marc/xmlwriter.rb
44
+ - test/batch.dat
45
+ - test/batch.xml
46
+ - test/one.dat
47
+ - test/tc_controlfield.rb
48
+ - test/tc_datafield.rb
49
+ - test/tc_reader.rb
50
+ - test/tc_record.rb
51
+ - test/tc_subfield.rb
52
+ - test/tc_writer.rb
53
+ - test/tc_xml.rb
54
+ - test/ts_marc.rb
56
55
  test_files:
57
- - test/ts_marc.rb
56
+ - test/ts_marc.rb
58
57
  rdoc_options: []
59
-
60
58
  extra_rdoc_files: []
61
-
62
59
  executables: []
63
-
64
60
  extensions: []
65
-
66
61
  requirements: []
67
-
68
- dependencies: []
69
-
62
+ dependencies: []
data/test/foo.xml DELETED
@@ -1 +0,0 @@
1
- <?xml version='1.0'?><collection xmlns='http://www.loc.gov/MARC21/slim' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'><record><leader>00925njm 22002777a 4500</leader><controlfield tag='007'>sdubumennmplu</controlfield><datafield tag='245' ind1='0' ind2='4'><subfield code='a'>The Great Ray Charles</subfield><subfield code='h'>[sound recording].</subfield></datafield></record></collection>
data/test/tc_xmlwriter.rb DELETED
@@ -1,37 +0,0 @@
1
- require 'test/unit'
2
- require 'marc'
3
-
4
- class XMLWriterTest < Test::Unit::TestCase
5
-
6
- def test_writer()
7
- # get a record
8
- reader = MARC::Reader.new('test/one.dat')
9
- record = reader.entries[0]
10
-
11
- str_writer = StringWriter.new()
12
- xml_writer = MARC::XMLWriter.new(str_writer)
13
- xml_writer.write(record)
14
- assert_match /<\?xml version='1.0'\?>/, str_writer.buffer
15
- end
16
- end
17
-
18
- # little class that enables writing to a string
19
- # like it's a file
20
-
21
- class StringWriter
22
- attr_reader :buffer
23
-
24
- def initialize
25
- @buffer = ''
26
- end
27
-
28
- def write(str)
29
- @buffer += str
30
- end
31
-
32
- def to_s
33
- return @buffer
34
- end
35
- end
36
-
37
-