marc 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/marc/reader.rb CHANGED
@@ -12,7 +12,10 @@ module MARC
12
12
  # fh = File.new('marc.dat')
13
13
  # reader = MARC::Reader.new(fh)
14
14
  #
15
- # or really any object that responds to read(n).
15
+ # or really any object that responds to read(n)
16
+ #
17
+ # # marc is a string with a bunch of records in it
18
+ # reader = MARC::Reader.new(StringIO.new(marc))
16
19
 
17
20
  def initialize(file)
18
21
  if file.class == String:
data/lib/marc/record.rb CHANGED
@@ -39,7 +39,7 @@ module MARC
39
39
  # title = record.find {|f| f.tag == '245'}
40
40
  #
41
41
  # getting all subjects
42
- # subjects = record.find_all {|f| ('600'..'699' === f.tag)}
42
+ # subjects = record.find_all {|f| ('600'..'699') === f.tag}
43
43
 
44
44
  def each
45
45
  for field in @fields
@@ -1,4 +1,5 @@
1
1
  require 'rexml/document'
2
+ require 'rexml/text'
2
3
 
3
4
  module MARC
4
5
 
@@ -8,8 +9,13 @@ module MARC
8
9
 
9
10
  # the constructor which you must pass a file path
10
11
  # or an object that responds to a write message
12
+ # the second argument is a hash of options, currently
13
+ # only supporting one option, stylesheet
14
+ #
15
+ # writer = XMLWriter.new 'marc.xml', :stylesheet => 'style.xsl'
16
+ # writer.write record
11
17
 
12
- def initialize(file)
18
+ def initialize(file, opts={})
13
19
  if file.class == String
14
20
  @fh = File.new(file,"w")
15
21
  elsif file.respond_to?('write')
@@ -19,17 +25,22 @@ module MARC
19
25
  end
20
26
 
21
27
  @fh.write("<?xml version='1.0'?>")
22
-
28
+ if opts[:stylesheet]
29
+ @fh.write(
30
+ %Q{<?xml-stylesheet type="text/xsl" href="#{opts[:stylesheet]}"?>})
31
+ end
23
32
  @fh.write("<collection xmlns='" + MARC_NS + "' " +
24
33
  "xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' " +
25
34
  "xsi:schemaLocation='" + MARC_NS + " " + MARC_XSD + "'>")
35
+ @fh.write("\n")
26
36
  end
27
37
 
28
38
 
29
39
  # write a record to the file or handle
30
40
 
31
41
  def write(record)
32
- @fh.write(MARC::XMLWriter.encode(record).to_s)
42
+ MARC::XMLWriter.encode(record).write(@fh, 0)
43
+ @fh.write("\n");
33
44
  end
34
45
 
35
46
 
@@ -40,24 +51,77 @@ module MARC
40
51
  @fh.close
41
52
  end
42
53
 
54
+
55
+ # Converts from ISO 8859-1 to UTF-8, normalizes the UTF-8, and puts a
56
+ # 'clean up marker' in records that have control characters (which are
57
+ # not valid in XML). This is useful for locating these records once
58
+ # they are in XML so problems caused by removing the invalid characters
59
+ # can be fixed by a person. This (or something in the module) needs to
60
+ # convert from MARC-8 to UTF-8, but it doesn't do this yet...
61
+
62
+ def self.convert_to_utf8(text)
63
+ cleaned_text = text.gsub(/[\x00-\x1f\x7f-\xff]+/, ' CLEAN_ME_UP ')
64
+ utf8_text = cleaned_text.unpack('C*').pack('U*')
65
+ normalized_text = REXML::Text::normalize(utf8_text)
66
+
67
+ return normalized_text
68
+ end
43
69
 
44
70
  # a static method that accepts a MARC::Record object
45
- # and returns a REXML::Document for the XML serialization
71
+ # and returns a REXML::Document for the XML serialization.
46
72
 
47
73
  def self.encode(record)
74
+ singleChar = Regexp.new(/[\da-z ]{1}/)
75
+ ctrlFieldTag = Regexp.new(/00[1-9A-Za-z]{1}s/)
76
+
77
+ # Right now, this writer handles input from the strict and
78
+ # lenient MARC readers. Because it can get 'loose' MARC in, it
79
+ # attempts to do some cleanup on data values that are not valid
80
+ # MARCXML.
81
+
82
+ # TODO? Perhaps the 'loose MARC' checks should be split out
83
+ # into a tolerant MARCXMLWriter allowing the main one to skip
84
+ # this extra work.
85
+
86
+ # TODO: At the very least there should be some logging
87
+ # to record our attempts to account for less than perfect MARC.
88
+
48
89
  root = "<record/>"
49
- doc = REXML::Document.new root
90
+ doc = REXML::Document.new(root)
91
+
92
+ # MARCXML only allows alphanumerics or spaces in the leader
93
+ record.leader.gsub!(/[^\w|^\s]/, 'Z')
94
+
95
+ # MARCXML is particular about last four characters; ILSes aren't
96
+ if (record.leader[20..23] != "4500")
97
+ record.leader[20..23] = "4500"
98
+ end
50
99
 
51
- # MARCXML is particular about this; ILSes aren't
52
- record.leader[20..24] = "4500"
100
+ # MARCXML doesn't like a space here so we need a filler character: Z
101
+ if (record.leader[6..6] == " ")
102
+ record.leader[6..6] = "Z"
103
+ end
53
104
 
54
- leader = REXML::Element.new "leader"
55
- leader.add_text record.leader
56
- doc.root.add_element leader
105
+ leader = REXML::Element.new("leader")
106
+ leader.add_text(record.leader)
107
+ doc.root.add_element(leader)
57
108
 
58
109
  for field in record.fields
59
110
  if field.class == MARC::DataField
60
- datafield_elem = REXML::Element.new "datafield"
111
+ datafield_elem = REXML::Element.new("datafield")
112
+
113
+ # If marc is leniently parsed, we may have some dirty data; using
114
+ # the 'z' ind1 value should help us locate these later to fix
115
+ if (field.indicator1.match(singleChar) == nil)
116
+ field.indicator1 = 'z'
117
+ end
118
+
119
+ # If marc is leniently parsed, we may have some dirty data; using
120
+ # the 'z' ind2 value should help us locate these later to fix
121
+ if (field.indicator2.match(singleChar) == nil)
122
+ field.indicator2 = 'z'
123
+ end
124
+
61
125
  datafield_elem.add_attributes({
62
126
  "tag"=>field.tag,
63
127
  "ind1"=>field.indicator1,
@@ -65,18 +129,33 @@ module MARC
65
129
  })
66
130
 
67
131
  for subfield in field.subfields
68
- subfield_element = REXML::Element.new "subfield"
132
+ subfield_element = REXML::Element.new("subfield")
133
+
134
+ # If marc is leniently parsed, we may have some dirty data; using
135
+ # the blank subfield code should help us locate these later to fix
136
+ if (subfield.code.match(singleChar) == nil)
137
+ subfield.code = ' '
138
+ end
139
+
69
140
  subfield_element.add_attribute("code", subfield.code)
70
- subfield_element.add_text subfield.value
71
- datafield_elem.add_element subfield_element
141
+ text = MARC::XMLWriter.convert_to_utf8(subfield.value)
142
+ subfield_element.add_text(text)
143
+ datafield_elem.add_element(subfield_element)
72
144
  end
73
145
 
74
146
  doc.root.add_element datafield_elem
75
147
  elsif field.class == MARC::ControlField
76
- control_element = REXML::Element.new "controlfield"
148
+ control_element = REXML::Element.new("controlfield")
149
+
150
+ # We need a marker for invalid tag values (we use 000)
151
+ if (field.tag.match(ctrlFieldTag) == nil)
152
+ field.tag = "00z"
153
+ end
154
+
77
155
  control_element.add_attribute("tag", field.tag)
78
- control_element.add_text field.value
79
- doc.root.add_element control_element
156
+ text = MARC::XMLWriter.convert_to_utf8(field.value)
157
+ control_element.add_text(text)
158
+ doc.root.add_element(control_element)
80
159
  end
81
160
  end
82
161
 
@@ -1,7 +1,7 @@
1
1
  require 'test/unit'
2
2
  require 'marc'
3
3
 
4
- class XMLReaderTest < Test::Unit::TestCase
4
+ class XMLTest < Test::Unit::TestCase
5
5
 
6
6
  def otest_batch
7
7
  reader = MARC::XMLReader.new('test/batch.xml')
@@ -20,15 +20,18 @@ class XMLReaderTest < Test::Unit::TestCase
20
20
  record1.append MARC::DataField.new('245', '0', '4',
21
21
  ['a', 'The Great Ray Charles'], ['h', '[sound recording].'])
22
22
 
23
- writer = MARC::XMLWriter.new('test/foo.xml')
23
+ writer = MARC::XMLWriter.new('test/test.xml', :stylesheet => 'style.xsl')
24
24
  writer.write(record1)
25
25
  writer.close
26
26
 
27
- reader = MARC::XMLReader.new('test/foo.xml')
27
+ xml = File.read('test/test.xml')
28
+ assert_match /<\?xml-stylesheet type="text\/xsl" href="style.xsl"\?>/, xml
29
+
30
+ reader = MARC::XMLReader.new('test/test.xml')
28
31
  record2 = reader.entries[0]
29
32
  assert_equal(record1, record2)
30
33
 
31
- #File.unlink('test/foo.xml')
34
+ File.unlink('test/test.xml')
32
35
  end
33
36
  end
34
37
 
data/test/ts_marc.rb CHANGED
@@ -11,5 +11,4 @@ require 'test/tc_controlfield'
11
11
  require 'test/tc_record'
12
12
  require 'test/tc_reader'
13
13
  require 'test/tc_writer'
14
- require 'test/tc_xmlwriter'
15
- require 'test/tc_xmlreader'
14
+ require 'test/tc_xml'
metadata CHANGED
@@ -3,11 +3,11 @@ rubygems_version: 0.8.11
3
3
  specification_version: 1
4
4
  name: marc
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.0.8
7
- date: 2006-01-16 00:00:00 -05:00
6
+ version: 0.0.9
7
+ date: 2006-03-28 00:00:00 -06:00
8
8
  summary: A ruby library for working with Machine Readable Cataloging
9
9
  require_paths:
10
- - lib
10
+ - lib
11
11
  email: ehs@pobox.com
12
12
  homepage: http://www.textualize.com/ruby_marc
13
13
  rubyforge_project:
@@ -18,52 +18,45 @@ bindir: bin
18
18
  has_rdoc: true
19
19
  required_ruby_version: !ruby/object:Gem::Version::Requirement
20
20
  requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
21
+ -
22
+ - ">"
23
+ - !ruby/object:Gem::Version
24
+ version: 0.0.0
24
25
  version:
25
26
  platform: ruby
26
27
  signing_key:
27
28
  cert_chain:
28
29
  authors:
29
- - Ed Summers
30
+ - Ed Summers
30
31
  files:
31
- - lib/marc
32
- - lib/marc.rb
33
- - lib/marc/writer.rb
34
- - lib/marc/subfield.rb
35
- - lib/marc/datafield.rb
36
- - lib/marc/record.rb
37
- - lib/marc/constants.rb
38
- - lib/marc/xmlreader.rb
39
- - lib/marc/xmlwriter.rb
40
- - lib/marc/exception.rb
41
- - lib/marc/controlfield.rb
42
- - lib/marc/reader.rb
43
- - test/batch.dat
44
- - test/batch.xml
45
- - test/tc_datafield.rb
46
- - test/tc_reader.rb
47
- - test/foo.xml
48
- - test/tc_writer.rb
49
- - test/tc_controlfield.rb
50
- - test/tc_subfield.rb
51
- - test/one.dat
52
- - test/tc_xmlreader.rb
53
- - test/tc_xmlwriter.rb
54
- - test/tc_record.rb
55
- - test/ts_marc.rb
32
+ - lib/marc
33
+ - lib/marc.rb
34
+ - lib/marc/constants.rb
35
+ - lib/marc/controlfield.rb
36
+ - lib/marc/datafield.rb
37
+ - lib/marc/exception.rb
38
+ - lib/marc/reader.rb
39
+ - lib/marc/record.rb
40
+ - lib/marc/subfield.rb
41
+ - lib/marc/writer.rb
42
+ - lib/marc/xmlreader.rb
43
+ - lib/marc/xmlwriter.rb
44
+ - test/batch.dat
45
+ - test/batch.xml
46
+ - test/one.dat
47
+ - test/tc_controlfield.rb
48
+ - test/tc_datafield.rb
49
+ - test/tc_reader.rb
50
+ - test/tc_record.rb
51
+ - test/tc_subfield.rb
52
+ - test/tc_writer.rb
53
+ - test/tc_xml.rb
54
+ - test/ts_marc.rb
56
55
  test_files:
57
- - test/ts_marc.rb
56
+ - test/ts_marc.rb
58
57
  rdoc_options: []
59
-
60
58
  extra_rdoc_files: []
61
-
62
59
  executables: []
63
-
64
60
  extensions: []
65
-
66
61
  requirements: []
67
-
68
- dependencies: []
69
-
62
+ dependencies: []
data/test/foo.xml DELETED
@@ -1 +0,0 @@
1
- <?xml version='1.0'?><collection xmlns='http://www.loc.gov/MARC21/slim' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xsi:schemaLocation='http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd'><record><leader>00925njm 22002777a 4500</leader><controlfield tag='007'>sdubumennmplu</controlfield><datafield tag='245' ind1='0' ind2='4'><subfield code='a'>The Great Ray Charles</subfield><subfield code='h'>[sound recording].</subfield></datafield></record></collection>
data/test/tc_xmlwriter.rb DELETED
@@ -1,37 +0,0 @@
1
- require 'test/unit'
2
- require 'marc'
3
-
4
- class XMLWriterTest < Test::Unit::TestCase
5
-
6
- def test_writer()
7
- # get a record
8
- reader = MARC::Reader.new('test/one.dat')
9
- record = reader.entries[0]
10
-
11
- str_writer = StringWriter.new()
12
- xml_writer = MARC::XMLWriter.new(str_writer)
13
- xml_writer.write(record)
14
- assert_match /<\?xml version='1.0'\?>/, str_writer.buffer
15
- end
16
- end
17
-
18
- # little class that enables wriing to a string
19
- # like it's a file
20
-
21
- class StringWriter
22
- attr_reader :buffer
23
-
24
- def initialize
25
- @buffer = ''
26
- end
27
-
28
- def write(str)
29
- @buffer += str
30
- end
31
-
32
- def to_s
33
- return @buffer
34
- end
35
- end
36
-
37
-