marc 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/Changes CHANGED
@@ -1,3 +1,6 @@
1
+ v0.3.0 Wed Sep 23 21:51:00 EDT 2009
2
+ - Nokogiri and jrexml parser integration added as well as Ruby 1.9 support
3
+
1
4
  v0.2.2 Tue Dec 30 09:50:33 EST 2008
2
5
  - DataField tags that are all numeric are now padded with leading zeros
3
6
 
data/Rakefile CHANGED
@@ -1,4 +1,4 @@
1
- RUBY_MARC_VERSION = '0.2.2'
1
+ RUBY_MARC_VERSION = '0.3.0'
2
2
 
3
3
  require 'rubygems'
4
4
  require 'rake'
@@ -21,7 +21,7 @@ spec = Gem::Specification.new do |s|
21
21
  s.version = RUBY_MARC_VERSION
22
22
  s.author = 'Ed Summers'
23
23
  s.email = 'ehs@pobox.com'
24
- s.homepage = 'http://www.textualize.com/ruby_marc'
24
+ s.homepage = 'http://marc.rubyforge.org/'
25
25
  s.platform = Gem::Platform::RUBY
26
26
  s.summary = 'A ruby library for working with Machine Readable Cataloging'
27
27
  s.files = Dir.glob("{lib,test}/**/*") + ["Rakefile", "README", "Changes",
@@ -30,7 +30,7 @@ spec = Gem::Specification.new do |s|
30
30
  s.autorequire = 'marc'
31
31
  s.has_rdoc = true
32
32
  s.required_ruby_version = '>= 1.8.6'
33
-
33
+ s.authors = ["Kevin Clarke", "William Groppe", "Ross Singer", "Ed Summers"]
34
34
  s.test_file = 'test/ts_marc.rb'
35
35
  s.bindir = 'bin'
36
36
  end
@@ -41,7 +41,7 @@ Rake::GemPackageTask.new(spec) do |pkg|
41
41
  end
42
42
 
43
43
  Rake::RDocTask.new('doc') do |rd|
44
- rd.rdoc_files.include("lib/**/*.rb")
44
+ rd.rdoc_files.include("README", "Changes", "LICENSE", "lib/**/*.rb")
45
45
  rd.main = 'MARC::Record'
46
46
  rd.rdoc_dir = 'doc'
47
47
  end
data/lib/marc.rb CHANGED
@@ -24,6 +24,13 @@
24
24
  # writer = MARC::XMLWriter.new('marc.xml')
25
25
  # writer.write(record)
26
26
  # writer.close()
27
+ #
28
+ # # Deal with non-standard control field tags
29
+ # MARC::ControlField.control_tags << 'FMT'
30
+ # record = MARC::Record.new()
31
+ # record.add_field(MARC::ControlField.new('FMT', 'Book')) # doesn't throw an error
32
+
33
+
27
34
 
28
35
  require 'marc/constants'
29
36
  require 'marc/record'
@@ -36,3 +43,4 @@ require 'marc/exception'
36
43
  require 'marc/xmlwriter'
37
44
  require 'marc/xmlreader'
38
45
  require 'marc/dublincore'
46
+ require 'marc/xml_parsers'
@@ -11,4 +11,8 @@ module MARC
11
11
  MARC_NS = "http://www.loc.gov/MARC21/slim"
12
12
  MARC_XSD = "http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"
13
13
 
14
+ # marc-hash
15
+ MARCHASH_MAJOR_VERSION = 1
16
+ MARCHASH_MINOR_VERSION = 0
17
+
14
18
  end
@@ -1,10 +1,29 @@
1
+ require 'set'
2
+
1
3
  module MARC
2
4
 
3
5
  # MARC records contain control fields, each of which has a
4
6
  # tag and value. Tags for control fields must be in the
5
- # 001-009 range.
7
+ # 001-009 range or be specially added to the @@control_tags Set
6
8
 
7
9
  class ControlField
10
+
11
+ # Initially, control tags are the numbers 1 through 9 or the string '000'
12
+ @@control_tags = Set.new( (1..9).to_a)
13
+ @@control_tags << '000'
14
+
15
+ def self.control_tags
16
+ return @@control_tags
17
+ end
18
+
19
+ # A tag is a control tag if it is a member of the @@control_tags set
20
+ # as either a string (e.g., 'FMT') or in its .to_i representation
21
+ # (e.g., '008'.to_i == 8 is in @@control_tags by default)
22
+
23
+ def self.control_tag?(tag)
24
+ return (@@control_tags.include?(tag.to_i) or @@control_tags.include?(tag))
25
+ end
26
+
8
27
 
9
28
  # the tag value (007, 008, etc)
10
29
  attr_accessor :tag
@@ -18,8 +37,8 @@ module MARC
18
37
  def initialize(tag,value='')
19
38
  @tag = tag
20
39
  @value = value
21
- if tag.to_i > 9
22
- raise MARC::Exception.new(), "tag must be greater than 009"
40
+ if not MARC::ControlField.control_tag?(@tag)
41
+ raise MARC::Exception.new(), "tag must be in 001-009 or in the MARC::ControlField.control_tags set"
23
42
  end
24
43
  end
25
44
 
@@ -34,13 +53,19 @@ module MARC
34
53
  return true
35
54
  end
36
55
 
56
+ # turning it into a marc-hash element
57
+ def to_marchash
58
+ return [@tag, @value]
59
+ end
60
+
61
+
37
62
  def to_s
38
63
  return "#{tag} #{value}"
39
64
  end
40
65
 
41
66
  def =~(regex)
42
67
  return self.to_s =~ regex
43
- end
68
+ end
44
69
 
45
70
  end
46
71
 
@@ -1,11 +1,14 @@
1
1
  require 'marc/subfield'
2
2
  require 'marc/record'
3
+ require 'marc/controlfield'
3
4
 
4
5
  module MARC
5
6
 
6
7
  # MARC records contain data fields, each of which has a tag,
7
- # indicators and subfields. Tags for data fields must be in
8
- # the range 010-999.
8
+ # indicators and subfields. Tags for data fields are all
9
+ # three-character tags that are not control fields (generally,
10
+ # any numeric tag greater than 009).
11
+ #
9
12
  # Accessor attributes: tag, indicator1, indicator2
10
13
  #
11
14
  # DataField mixes in Enumerable to enable access to its constituent
@@ -63,10 +66,12 @@ module MARC
63
66
  @indicator2 = i2 == nil ? ' ' : i2
64
67
  @subfields = []
65
68
 
66
- # must use MARC::ControlField for tags < 010
67
- if @tag.to_i < 10 and not @tag =~ /[A-z]/
69
+ # must use MARC::ControlField for tags < 010 or
70
+ # those in MARC::ControlField.control_tags
71
+
72
+ if MARC::ControlField.control_tag?(@tag)
68
73
  raise MARC::Exception.new(),
69
- "MARC::DataField objects can't have tags < 010"
74
+ "MARC::DataField objects can't have ControlField tag '" + @tag + "')"
70
75
  end
71
76
 
72
77
  # allows MARC::Subfield objects to be passed directly
@@ -78,7 +83,7 @@ module MARC
78
83
  when Array
79
84
  if subfield.length > 2
80
85
  raise MARC::Exception.new(),
81
- "arrays must only have 2 elements"
86
+ "arrays must only have 2 elements: " + subfield.to_s
82
87
  end
83
88
  @subfields.push(
84
89
  MARC::Subfield.new(subfield[0],subfield[1]))
@@ -100,6 +105,11 @@ module MARC
100
105
  return str
101
106
  end
102
107
 
108
+ # Turn into a marc-hash structure
109
+ def to_marchash
110
+ return [@tag, @indicator1, @indicator2, @subfields.map {|sf| [sf.code, sf.value]} ]
111
+ end
112
+
103
113
 
104
114
  # Add a subfield (MARC::Subfield) to the field
105
115
  # field.append(MARC::Subfield.new('a','Dave Thomas'))
data/lib/marc/reader.rb CHANGED
@@ -16,9 +16,15 @@ module MARC
16
16
  #
17
17
  # # marc is a string with a bunch of records in it
18
18
  # reader = MARC::Reader.new(StringIO.new(marc))
19
+ #
20
+ # If your data have non-standard control fields in them
21
+ # (e.g., Aleph's 'FMT') you need to add them specifically
22
+ # to the MARC::ControlField.control_tags Set object
23
+ #
24
+ # MARC::ControlField.control_tags << 'FMT'
19
25
 
20
26
  def initialize(file)
21
- if file.class == String:
27
+ if file.is_a?(String)
22
28
  @handle = File.new(file)
23
29
  elsif file.respond_to?("read", 5)
24
30
  @handle = file
@@ -40,7 +46,7 @@ module MARC
40
46
  while rec_length_s = @handle.read(5)
41
47
  # make sure the record length looks like an integer
42
48
  rec_length_i = rec_length_s.to_i
43
- if rec_length_i == 0:
49
+ if rec_length_i == 0
44
50
  raise MARC::Exception.new("invalid record length: #{rec_length_s}")
45
51
  end
46
52
 
@@ -113,7 +119,7 @@ module MARC
113
119
  field_data.delete!(END_OF_FIELD)
114
120
 
115
121
  # add a control field or data field
116
- if tag < '010'
122
+ if MARC::ControlField.control_tag?(tag)
117
123
  record.append(MARC::ControlField.new(tag,field_data))
118
124
  else
119
125
  field = MARC::DataField.new(tag)
data/lib/marc/record.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require 'marc/controlfield'
2
+ require 'marc/datafield'
3
+
1
4
  module MARC
2
5
 
3
6
  # A class that represents an individual MARC record. Every record
@@ -118,11 +121,40 @@ module MARC
118
121
  return MARC::DublinCore.map(self)
119
122
  end
120
123
 
124
+ # Return a marc-hash version of the record
125
+ def to_marchash
126
+ return {
127
+ 'type' => 'marc-hash',
128
+ 'version' => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION],
129
+ 'leader' => self.leader,
130
+ 'fields' => self.map {|f| f.to_marchash}
131
+ }
132
+ end #to_hash
133
+
134
+ # Factory method for creating a new MARC::Record from
135
+ # a marchash object
136
+ #
137
+ # record = MARC::Record.new_from_marchash(mh)
138
+
139
+ def self.new_from_marchash(mh)
140
+ r = self.new()
141
+ r.leader = mh['leader']
142
+ mh['fields'].each do |f|
143
+ if (f.length == 2)
144
+ r << MARC::ControlField.new(f[0], f[1])
145
+ elsif
146
+ r << MARC::DataField.new(f[0], f[1], f[2], *f[3])
147
+ end
148
+ end
149
+ return r
150
+ end
151
+
152
+
121
153
  # Returns a string version of the record, suitable for printing
122
154
 
123
155
  def to_s
124
156
  str = "LEADER #{leader}\n"
125
- for field in fields:
157
+ for field in fields
126
158
  str += field.to_s() + "\n"
127
159
  end
128
160
  return str
@@ -0,0 +1,288 @@
1
+ module MARC
2
+ # The MagicReader will try to use the best available XML Parser at the
3
+ # time of initialization.
4
+ # The order is currently:
5
+ # * Nokogiri
6
+ # * jrexml (JRuby only)
7
+ # * rexml
8
+ #
9
+ # With the idea that other parsers could be added as their modules are
10
+ # added. Realistically, this list should be limited to stream-based
11
+ # parsers. The magic should be used selectively, however. After all,
12
+ # one project's definition of 'best' might not apply universally. It
13
+ # is arguable which is "best" on JRuby: Nokogiri or jrexml.
14
+ module MagicReader
15
+ def self.extended(receiver)
16
+ # Start with a Nokogiri check
17
+ begin
18
+ require 'nokogiri'
19
+ receiver.extend(NokogiriReader)
20
+ rescue LoadError
21
+ if RUBY_PLATFORM =~ /java/
22
+ # If using JRuby, use JREXML if it's there
23
+ begin
24
+ receiver.extend(JREXMLReader)
25
+ return
26
+ rescue LoadError
27
+ end
28
+ end
29
+ # If you're here, you're stuck with lowly REXML
30
+ receiver.extend(REXMLReader)
31
+ end
32
+ end
33
+ end
34
+
35
+ # NokogiriReader uses the Nokogiri SAX Parser to quickly read
36
+ # a MARCXML document. Because dynamically subclassing MARC::XMLReader
37
+ # is a little ugly, we need to recreate all of the SAX event methods
38
+ # from Nokogiri::XML::SAX::Document here rather than subclassing.
39
+ module NokogiriReader
40
+ def self.extended(receiver)
41
+ require 'nokogiri'
42
+ receiver.init
43
+ end
44
+
45
+ # Sets our instance variables for SAX parsing in Nokogiri and parser
46
+ def init
47
+ @record = {:record=>nil,:field=>nil,:subfield=>nil}
48
+ @current_element = nil
49
+ @ns = "http://www.loc.gov/MARC21/slim"
50
+ @parser = Nokogiri::XML::SAX::Parser.new(self)
51
+ end
52
+
53
+ # Loop through the MARC records in the XML document
54
+ def each(&block)
55
+ @block = block
56
+ @parser.parse(@handle)
57
+ end
58
+
59
+ # Returns our MARC::Record object to the #each block.
60
+ def yield_record
61
+ @block.call(@record[:record])
62
+ @record[:record] = nil
63
+ end
64
+
65
+ def start_element_namespace name, attributes = [], prefix = nil, uri = nil, ns = {}
66
+ attributes = attributes_to_hash(attributes)
67
+ if uri == @ns
68
+ case name.downcase
69
+ when 'record' then @record[:record] = MARC::Record.new
70
+ when 'leader' then @current_element = :leader
71
+ when 'controlfield'
72
+ @current_element=:field
73
+ @record[:field] = MARC::ControlField.new(attributes["tag"])
74
+ when 'datafield'
75
+ @record[:field] = MARC::DataField.new(attributes["tag"], attributes['ind1'], attributes['ind2'])
76
+ when 'subfield'
77
+ @current_element=:subfield
78
+ @record[:subfield] = MARC::Subfield.new(attributes['code'])
79
+ end
80
+ end
81
+ end
82
+
83
+ def characters text
84
+ case @current_element
85
+ when :leader then @record[:record].leader = text
86
+ when :field then @record[:field].value << text
87
+ when :subfield then @record[:subfield].value << text
88
+ end
89
+ end
90
+
91
+ def end_element_namespace name, prefix = nil, uri = nil
92
+ @current_element = nil
93
+ if uri == "http://www.loc.gov/MARC21/slim"
94
+ case name.downcase
95
+ when 'record' then yield_record
96
+ when /(control|data)field/
97
+ @record[:record] << @record[:field]
98
+ @record[:field] = nil
99
+ @current_element = nil if @current_element == :field
100
+ when 'subfield'
101
+ @record[:field].append(@record[:subfield])
102
+ @record[:subfield] = nil
103
+ @current_element = nil if @current_element == :subfield
104
+ end
105
+ end
106
+ end
107
+
108
+ def method_missing(methName, *args)
109
+ sax_methods = [:xmldecl, :start_document, :end_document, :start_element,
110
+ :end_element, :comment, :warning, :error, :cdata_block]
111
+ unless sax_methods.index(methName)
112
+ raise NoMethodError.new("undefined method '#{methName} for #{self}", 'no_meth')
113
+ end
114
+ end
115
+
116
+ private
117
+
118
+ def attributes_to_hash(attributes)
119
+ hash = {}
120
+ attributes.each do | att |
121
+ hash[att.localname] = att.value
122
+ end
123
+ hash
124
+ end
125
+ end
126
+
127
+ # The REXMLReader is the 'default' parser, since we can at least be
128
+ # assured that REXML is probably there. It uses REXML's PullParser
129
+ # to handle larger document sizes without consuming insane amounts of
130
+ # memory, but it's still REXML (read: slow), so it's a good idea to
131
+ # use an alternative parser if available. If you don't know the best
132
+ # parser available, you can use the MagicReader or set:
133
+ #
134
+ # MARC::XMLReader.parser=MARC::XMLReader::USE_BEST_AVAILABLE
135
+ #
136
+ # or
137
+ #
138
+ # MARC::XMLReader.parser="magic"
139
+ #
140
+ # or
141
+ #
142
+ # reader = MARC::XMLReader.new(fh, :parser=>"magic")
143
+ # (or the constant)
144
+ #
145
+ # which will cascade down to REXML if nothing better is found.
146
+ #
147
+ module REXMLReader
148
+ def self.extended(receiver)
149
+ require 'rexml/document'
150
+ require 'rexml/parsers/pullparser'
151
+ receiver.init
152
+ end
153
+
154
+ # Sets our parser
155
+ def init
156
+ @parser = REXML::Parsers::PullParser.new(@handle)
157
+ end
158
+
159
+ # Loop through the MARC records in the XML document
160
+ def each
161
+ while @parser.has_next?
162
+ event = @parser.pull
163
+ # if it's the start of a record element
164
+ if event.start_element? and strip_ns(event[0]) == 'record'
165
+ yield build_record
166
+ end
167
+ end
168
+ end
169
+
170
+ private
171
+ def strip_ns(str)
172
+ return str.sub(/^.*:/, '')
173
+ end
174
+
175
+ # will accept parse events until a record has been built up
176
+ #
177
+ def build_record
178
+ record = MARC::Record.new
179
+ data_field = nil
180
+ control_field = nil
181
+ subfield = nil
182
+ text = ''
183
+ attrs = nil
184
+ if Module.constants.index('Nokogiri') and @parser.is_a?(Nokogiri::XML::Reader)
185
+ datafield = nil
186
+ cursor = nil
187
+ open_elements = []
188
+ @parser.each do | node |
189
+ if node.value? && cursor
190
+ if cursor.is_a?(Symbol) and cursor == :leader
191
+ record.leader = node.value
192
+ else
193
+ cursor.value = node.value
194
+ end
195
+ cursor = nil
196
+ end
197
+ next unless node.namespace_uri == @ns
198
+ if open_elements.index(node.local_name.downcase)
199
+ open_elements.delete(node.local_name.downcase)
200
+ next
201
+ else
202
+ open_elements << node.local_name.downcase
203
+ end
204
+ case node.local_name.downcase
205
+ when "leader"
206
+ cursor = :leader
207
+ when "controlfield"
208
+ record << datafield if datafield
209
+ datafield = nil
210
+ control_field = MARC::ControlField.new(node.attribute('tag'))
211
+ record << control_field
212
+ cursor = control_field
213
+ when "datafield"
214
+ record << datafield if datafield
215
+ datafield = nil
216
+ data_field = MARC::DataField.new(node.attribute('tag'), node.attribute('ind1'), node.attribute('ind2'))
217
+ datafield = data_field
218
+ when "subfield"
219
+ raise "No datafield to add to" unless datafield
220
+ subfield = MARC::Subfield.new(node.attribute('code'))
221
+ datafield.append(subfield)
222
+ cursor = subfield
223
+ when "record"
224
+ record << datafield if datafield
225
+ return record
226
+ end
227
+ #puts node.name
228
+ end
229
+
230
+ else
231
+ while @parser.has_next?
232
+ event = @parser.pull
233
+
234
+ if event.text?
235
+ text += REXML::Text::unnormalize(event[0])
236
+ next
237
+ end
238
+
239
+ if event.start_element?
240
+ text = ''
241
+ attrs = event[1]
242
+ case strip_ns(event[0])
243
+ when 'controlfield'
244
+ text = ''
245
+ control_field = MARC::ControlField.new(attrs['tag'])
246
+ when 'datafield'
247
+ text = ''
248
+ data_field = MARC::DataField.new(attrs['tag'], attrs['ind1'],
249
+ attrs['ind2'])
250
+ when 'subfield'
251
+ text = ''
252
+ subfield = MARC::Subfield.new(attrs['code'])
253
+ end
254
+ end
255
+
256
+ if event.end_element?
257
+ case strip_ns(event[0])
258
+ when 'leader'
259
+ record.leader = text
260
+ when 'record'
261
+ return record
262
+ when 'controlfield'
263
+ control_field.value = text
264
+ record.append(control_field)
265
+ when 'datafield'
266
+ record.append(data_field)
267
+ when 'subfield'
268
+ subfield.value = text
269
+ data_field.append(subfield)
270
+ end
271
+ end
272
+ end
273
+ end
274
+ end
275
+ end
276
+
277
+ # The JREXMLReader is really just here to set the load order for
278
+ # injecting the Java pull parser.
279
+ module JREXMLReader
280
+
281
+ def self.extended(receiver)
282
+ require 'rexml/document'
283
+ require 'rexml/parsers/pullparser'
284
+ require 'jrexml'
285
+ receiver.extend(REXMLReader)
286
+ end
287
+ end
288
+ end
@@ -1,103 +1,142 @@
1
- require 'rexml/document'
2
- require 'rexml/parsers/pullparser'
3
-
1
+ require File.dirname(__FILE__) + '/xml_parsers'
4
2
  module MARC
5
-
3
+
4
+ # the constructor which you can pass either a filename:
5
+ #
6
+ # reader = MARC::XMLReader.new('/Users/edsu/marc.xml')
7
+ #
8
+ # or a File object,
9
+ #
10
+ # reader = MARC::XMLReader.new(File.new('/Users/edsu/marc.xml'))
11
+ #
12
+ # or really any object that responds to read(n)
13
+ #
14
+ # reader = MARC::XMLReader.new(StringIO.new(xml))
15
+ #
16
+ # By default, XMLReader uses REXML's pull parser, but you can swap
17
+ # that out with Nokogiri or jrexml (or let the system choose the
18
+ # 'best' one). The :parser can either be one of the defined constants
19
+ # or the constant's value.
20
+ #
21
+ # reader = MARC::XMLReader.new(fh, :parser=>'magic')
22
+ #
23
+ # It is also possible to set the default parser at the class level so
24
+ # all subsequent instances will use it instead:
25
+ #
26
+ # MARC::XMLReader.best_available
27
+ # "nokogiri" # returns parser name, but doesn't set it.
28
+ #
29
+ # Use:
30
+ # MARC::XMLReader.best_available!
31
+ #
32
+ # or
33
+ # MARC::XMLReader.nokogiri!
34
+ #
6
35
  class XMLReader
7
36
  include Enumerable
8
-
9
- # the constructor which you can pass either a filename:
10
- #
11
- # reader = MARC::XMLReader.new('/Users/edsu/marc.xml')
12
- #
13
- # or a File object,
14
- #
15
- # reader = Marc::XMLReader.new(File.new('/Users/edsu/marc.xml'))
16
- #
17
- # or really any object that responds to read(n)
18
- #
19
- # reader = MARC::XMLReader.new(StringIO.new(xml))
37
+ USE_BEST_AVAILABLE = 'magic'
38
+ USE_REXML = 'rexml'
39
+ USE_NOKOGIRI = 'nokogiri'
40
+ USE_JREXML = 'jrexml'
41
+ @@parser = USE_REXML
42
+ attr_reader :parser
20
43
 
21
- def initialize(file)
22
- if file.class == String:
44
+ def initialize(file, options = {})
45
+ if file.is_a?(String)
23
46
  handle = File.new(file)
24
47
  elsif file.respond_to?("read", 5)
25
48
  handle = file
26
49
  else
27
50
  throw "must pass in path or File"
28
51
  end
52
+ @handle = handle
29
53
 
30
- @parser = REXML::Parsers::PullParser.new(handle)
31
- end
32
-
33
- def each
34
- while @parser.has_next?
35
- event = @parser.pull
36
- # if it's the start of a record element
37
- if event.start_element? and strip_ns(event[0]) == 'record'
38
- yield build_record
39
- end
54
+ if options[:parser]
55
+ parser = self.class.choose_parser(options[:parser].to_s)
56
+ else
57
+ parser = @@parser
58
+ end
59
+ case parser
60
+ when 'magic' then extend MagicReader
61
+ when 'rexml' then extend REXMLReader
62
+ when 'jrexml' then extend JREXMLReader
63
+ when 'nokogiri' then extend NokogiriReader
40
64
  end
41
65
  end
42
66
 
43
- private
44
-
45
- def strip_ns(str)
46
- return str.sub(/^.*:/, '')
67
+ # Returns the currently set parser type
68
+ def self.parser
69
+ return @@parser
70
+ end
71
+
72
+ # Returns an array of all the parsers available
73
+ def self.parsers
74
+ p = []
75
+ self.constants.each do | const |
76
+ next unless const.match("^USE_")
77
+ p << const
78
+ end
79
+ return p
80
+ end
81
+
82
+ # Sets the class parser
83
+ def self.parser=(p)
84
+ @@parser = choose_parser(p)
47
85
  end
48
86
 
49
- # will accept parse events until a record has been built up
50
- #
51
- def build_record
52
- record = MARC::Record.new
53
- data_field = nil
54
- control_field = nil
55
- subfield = nil
56
- text = ''
57
- attrs = nil
58
-
59
- while @parser.has_next?
60
- event = @parser.pull
61
-
62
- if event.text?
63
- text += REXML::Text::unnormalize(event[0])
64
- next
65
- end
66
-
67
- if event.start_element?
68
- text = ''
69
- attrs = event[1]
70
- case strip_ns(event[0])
71
- when 'controlfield'
72
- text = ''
73
- control_field = MARC::ControlField.new(attrs['tag'])
74
- when 'datafield'
75
- text = ''
76
- data_field = MARC::DataField.new(attrs['tag'], attrs['ind1'],
77
- attrs['ind2'])
78
- when 'subfield'
79
- text = ''
80
- subfield = MARC::Subfield.new(attrs['code'])
87
+ # Returns the value of the best available parser
88
+ def self.best_available
89
+ parser = nil
90
+ begin
91
+ require 'nokogiri'
92
+ parser = USE_NOKOGIRI
93
+ rescue LoadError
94
+ if RUBY_PLATFORM =~ /java/
95
+ begin
96
+ require 'jrexml'
97
+ parser = USE_JREXML
98
+ rescue LoadError
99
+ parser = USE_REXML
81
100
  end
101
+ else
102
+ parser = USE_REXML
82
103
  end
83
-
84
- if event.end_element?
85
- case strip_ns(event[0])
86
- when 'leader'
87
- record.leader = text
88
- when 'record'
89
- return record
90
- when 'controlfield'
91
- control_field.value = text
92
- record.append(control_field)
93
- when 'datafield'
94
- record.append(data_field)
95
- when 'subfield'
96
- subfield.value = text
97
- data_field.append(subfield)
98
- end
104
+ parser
105
+ end
106
+ end
107
+
108
+ # Sets the best available parser as the default
109
+ def self.best_available!
110
+ @@parser = self.best_available
111
+ end
112
+
113
+ # Sets Nokogiri as the default parser
114
+ def self.nokogiri!
115
+ @@parser = USE_NOKOGIRI
116
+ end
117
+
118
+ # Sets jrexml as the default parser
119
+ def self.jrexml!
120
+ @@parser = USE_JREXML
121
+ end
122
+
123
+ # Sets REXML as the default parser
124
+ def self.rexml!
125
+ @@parser = USE_REXML
126
+ end
127
+
128
+ protected
129
+
130
+ def self.choose_parser(p)
131
+ match = false
132
+ self.constants.each do | const |
133
+ next unless const.to_s.match("^USE_")
134
+ if self.const_get(const) == p
135
+ match = true
136
+ return p
99
137
  end
100
138
  end
139
+ raise ArgumentError.new("Parser '#{p}' not defined") unless match
101
140
  end
102
141
  end
103
142
  end
@@ -137,7 +137,7 @@ module MARC
137
137
  control_element = REXML::Element.new("controlfield")
138
138
 
139
139
  # We need a marker for invalid tag values (we use 000)
140
- unless field.tag.match(ctrlFieldTag)
140
+ unless field.tag.match(ctrlFieldTag) or MARC::Field.control_tag?(ctrlFieldTag)
141
141
  field.tag = "00z"
142
142
  end
143
143
 
@@ -15,6 +15,31 @@ class TestField < Test::Unit::TestCase
15
15
  end
16
16
  end
17
17
 
18
+ def test_alpha_control_field
19
+ assert_raise(MARC::Exception) do
20
+ # an alphabetic tag that has not been added to control_tags is rejected
21
+ field = MARC::ControlField.new('DDD')
22
+ end
23
+ end
24
+
25
+ def test_extra_control_field
26
+ MARC::ControlField.control_tags << 'FMT'
27
+ assert_nothing_raised do
28
+ field = MARC::ControlField.new('FMT')
29
+ end
30
+ assert_raise(MARC::Exception) do
31
+ field = MARC::DataField.new('FMT')
32
+ end
33
+ MARC::ControlField.control_tags.delete('FMT')
34
+ assert_nothing_raised do
35
+ field = MARC::DataField.new('FMT')
36
+ end
37
+ assert_raise(MARC::Exception) do
38
+ field = MARC::ControlField.new('FMT')
39
+ end
40
+
41
+ end
42
+
18
43
  def test_control_as_field
19
44
  assert_raise(MARC::Exception) do
20
45
  # can't have a control with a tag > 009
@@ -0,0 +1,37 @@
1
+ require 'test/unit'
2
+ require 'marc'
3
+ require 'rubygems'
4
+
5
+ class TestMARCHASH < Test::Unit::TestCase
6
+
7
+ def test_simple
8
+ simple = {
9
+ 'type' => 'marc-hash',
10
+ 'version' => [1,0],
11
+ 'leader' => 'LEADER',
12
+ 'fields' => [
13
+ ['245', '1', '0',
14
+ [
15
+ ['a', 'TITLE'],
16
+ ['b', 'SUBTITLE']
17
+ ]
18
+ ]
19
+ ]
20
+ }
21
+ r = MARC::Record.new()
22
+ r.leader = 'LEADER'
23
+ f = MARC::DataField.new('245', '1', '0', ['a', 'TITLE'], ['b', 'SUBTITLE'])
24
+ r << f
25
+ assert_equal(r.to_marchash, simple)
26
+ end
27
+
28
+ def test_real
29
+ reader = MARC::Reader.new('test/batch.dat')
30
+ reader.each do |r|
31
+ x = MARC::Record.new_from_marchash(r.to_marchash)
32
+ assert_equal(r,x)
33
+ end
34
+ end
35
+
36
+
37
+ end
@@ -0,0 +1,154 @@
1
+ require 'test/unit'
2
+ require 'marc'
3
+
4
+ class ParsersTest < Test::Unit::TestCase
5
+ def test_parser_default
6
+ assert_equal("rexml", MARC::XMLReader.parser)
7
+ reader = MARC::XMLReader.new('test/one.xml')
8
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
9
+ end
10
+
11
+ def test_set_nokogiri
12
+ begin
13
+ require 'nokogiri'
14
+ assert_equal("rexml", MARC::XMLReader.parser)
15
+ reader = MARC::XMLReader.new('test/one.xml')
16
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
17
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>MARC::XMLReader::USE_NOKOGIRI)
18
+ assert_kind_of(Nokogiri::XML::SAX::Parser, reader.parser)
19
+ assert_equal("rexml", MARC::XMLReader.parser)
20
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>'nokogiri')
21
+ assert_kind_of(Nokogiri::XML::SAX::Parser, reader.parser)
22
+ assert_equal("rexml", MARC::XMLReader.parser)
23
+ MARC::XMLReader.parser=MARC::XMLReader::USE_NOKOGIRI
24
+ assert_equal("nokogiri", MARC::XMLReader.parser)
25
+ reader = MARC::XMLReader.new('test/one.xml')
26
+ assert_kind_of(Nokogiri::XML::SAX::Parser, reader.parser)
27
+ MARC::XMLReader.parser="nokogiri"
28
+ assert_equal("nokogiri", MARC::XMLReader.parser)
29
+ reader = MARC::XMLReader.new('test/one.xml')
30
+ assert_kind_of(Nokogiri::XML::SAX::Parser, reader.parser)
31
+ rescue LoadError
32
+ puts "\nNokogiri not available, skipping 'test_set_nokogiri'.\n"
33
+ end
34
+ end
35
+
36
+ def test_set_jrexml
37
+ if RUBY_PLATFORM =~ /java/
38
+ begin
39
+ require 'jrexml'
40
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>MARC::XMLReader::USE_JREXML)
41
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
42
+ assert_equal("rexml", MARC::XMLReader.parser)
43
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>'jrexml')
44
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
45
+ assert_equal("rexml", MARC::XMLReader.parser)
46
+ MARC::XMLReader.parser=MARC::XMLReader::USE_JREXML
47
+ assert_equal("jrexml", MARC::XMLReader.parser)
48
+ reader = MARC::XMLReader.new('test/one.xml')
49
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
50
+ MARC::XMLReader.parser="jrexml"
51
+ assert_equal("jrexml", MARC::XMLReader.parser)
52
+ reader = MARC::XMLReader.new('test/one.xml')
53
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
54
+ rescue LoadError
55
+ puts "\njrexml not available, skipping 'test_set_jrexml'.\n"
56
+ end
57
+ else
58
+ puts "\nTest not being run from JRuby, skipping 'test_set_jrexml'.\n"
59
+ end
60
+ end
61
+
62
+ def test_set_rexml
63
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>MARC::XMLReader::USE_REXML)
64
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
65
+ assert_equal("rexml", MARC::XMLReader.parser)
66
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>'rexml')
67
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
68
+ assert_equal("rexml", MARC::XMLReader.parser)
69
+ MARC::XMLReader.parser=MARC::XMLReader::USE_REXML
70
+ assert_equal("rexml", MARC::XMLReader.parser)
71
+ reader = MARC::XMLReader.new('test/one.xml')
72
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
73
+ MARC::XMLReader.parser="rexml"
74
+ assert_equal("rexml", MARC::XMLReader.parser)
75
+ reader = MARC::XMLReader.new('test/one.xml')
76
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
77
+ end
78
+
79
+ def test_set_magic
80
+ magic_parser = nil
81
+ begin
82
+ require 'nokogiri'
83
+ magic_parser = Nokogiri::XML::SAX::Parser
84
+ rescue LoadError
85
+ magic_parser = REXML::Parsers::PullParser
86
+ end
87
+ puts "\nTesting 'test_set_magic' for parser: #{magic_parser}"
88
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>MARC::XMLReader::USE_BEST_AVAILABLE)
89
+ assert_kind_of(magic_parser, reader.parser)
90
+ assert_equal("rexml", MARC::XMLReader.parser)
91
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>'magic')
92
+ assert_kind_of(magic_parser, reader.parser)
93
+ assert_equal("rexml", MARC::XMLReader.parser)
94
+ MARC::XMLReader.parser=MARC::XMLReader::USE_BEST_AVAILABLE
95
+ assert_equal("magic", MARC::XMLReader.parser)
96
+ reader = MARC::XMLReader.new('test/one.xml')
97
+ assert_kind_of(magic_parser, reader.parser)
98
+ MARC::XMLReader.parser="magic"
99
+ assert_equal("magic", MARC::XMLReader.parser)
100
+ reader = MARC::XMLReader.new('test/one.xml')
101
+ assert_kind_of(magic_parser, reader.parser)
102
+ end
103
+
104
+ def test_parser_set_convenience_methods
105
+ parser_name = nil
106
+ parser = nil
107
+ begin
108
+ require 'nokogiri'
109
+ parser_name = 'nokogiri'
110
+ parser = Nokogiri::XML::SAX::Parser
111
+ rescue LoadError
112
+ parser = REXML::Parsers::PullParser
113
+ parser = 'rexml'
114
+ if RUBY_PLATFORM =~ /java/
115
+ begin
116
+ require 'jrexml'
117
+ parser_name = 'jrexml'
118
+ rescue LoadError
119
+ end
120
+ end
121
+ end
122
+ assert_equal(parser_name, MARC::XMLReader.best_available)
123
+ MARC::XMLReader.best_available!
124
+ reader = MARC::XMLReader.new('test/one.xml')
125
+ assert_kind_of(parser, reader.parser)
126
+ MARC::XMLReader.rexml!
127
+ reader = MARC::XMLReader.new('test/one.xml')
128
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
129
+ if parser_name == 'nokogiri'
130
+ MARC::XMLReader.nokogiri!
131
+ reader = MARC::XMLReader.new('test/one.xml')
132
+ assert_kind_of(Nokogiri::XML::SAX::Parser, reader.parser)
133
+ else
134
+ puts "\nNokogiri not loaded, skipping convenience method test.\n"
135
+ end
136
+ if RUBY_PLATFORM =~ /java/
137
+ begin
138
+ require 'jrexml'
139
+ MARC::XMLReader.jrexml!
140
+ reader = MARC::XMLReader.new('test/one.xml')
141
+ assert_kind_of(REXML::Parsers::PullParser, reader.parser)
142
+ rescue LoadError
143
+ puts "\njrexml not available, skipping convenience method test.\n"
144
+ end
145
+ else
146
+ puts "\nTest not being run from JRuby, skipping jrexml convenience method test.\n"
147
+ end
148
+ end
149
+
150
+ def teardown
151
+ MARC::XMLReader.parser=MARC::XMLReader::USE_REXML
152
+ end
153
+
154
+ end
data/test/tc_record.rb CHANGED
@@ -12,7 +12,12 @@ class TestRecord < Test::Unit::TestCase
12
12
  r = get_record()
13
13
  doc = r.to_xml
14
14
  assert_kind_of REXML::Element, doc
15
- assert_equal "<record xmlns='http://www.loc.gov/MARC21/slim'><leader> Z 22 4500</leader><datafield tag='100' ind1='2' ind2='0'><subfield code='a'>Thomas, Dave</subfield></datafield><datafield tag='245' ind1='0' ind2='4'><subfield code='The Pragmatic Programmer'></subfield></datafield></record>", doc.to_s
15
+ if RUBY_VERSION < '1.9.0'
16
+ assert_equal "<record xmlns='http://www.loc.gov/MARC21/slim'><leader> Z 22 4500</leader><datafield tag='100' ind1='2' ind2='0'><subfield code='a'>Thomas, Dave</subfield></datafield><datafield tag='245' ind1='0' ind2='4'><subfield code='The Pragmatic Programmer'></subfield></datafield></record>", doc.to_s
17
+ else
18
+ # REXML inexplicably sorts the attributes alphabetically in Ruby 1.9
19
+ assert_equal "<record xmlns='http://www.loc.gov/MARC21/slim'><leader> Z 22 4500</leader><datafield ind1='2' ind2='0' tag='100'><subfield code='a'>Thomas, Dave</subfield></datafield><datafield ind1='0' ind2='4' tag='245'><subfield code='The Pragmatic Programmer'></subfield></datafield></record>", doc.to_s
20
+ end
16
21
  end
17
22
 
18
23
  def test_append_field
data/test/tc_xml.rb CHANGED
@@ -3,20 +3,50 @@ require 'marc'
3
3
  require 'stringio'
4
4
 
5
5
  class XMLTest < Test::Unit::TestCase
6
+ def setup
7
+ @parsers = [:rexml]
8
+ begin
9
+ require 'nokogiri'
10
+ @parsers << :nokogiri
11
+ rescue LoadError
12
+ end
13
+ if RUBY_PLATFORM =~ /java/
14
+ begin
15
+ require 'jrexml'
16
+ @parsers << :jrexml
17
+ rescue LoadError
18
+ end
19
+ end
20
+ end
6
21
 
7
- def test_xml_entities
22
+
23
+ def test_xml_entities
24
+ @parsers.each do | parser |
25
+ puts "\nRunning test_xml_entities with: #{parser}.\n"
26
+ xml_entities_test(parser)
27
+ end
28
+ end
29
+
30
+ def xml_entities_test(parser)
8
31
  r1 = MARC::Record.new
9
32
  r1 << MARC::DataField.new('245', '0', '0', ['a', 'foo & bar & baz'])
10
33
  xml = r1.to_xml.to_s
11
34
  assert_match /foo &amp; bar &amp; baz/, xml
12
35
 
13
- reader = MARC::XMLReader.new(StringIO.new(xml))
36
+ reader = MARC::XMLReader.new(StringIO.new(xml), :parser=>parser)
14
37
  r2 = reader.entries[0]
15
- assert_equal 'foo & bar & baz', r2['245']['a']
38
+ assert_equal 'foo & bar & baz', r2['245']['a']
16
39
  end
17
-
40
+
18
41
  def test_batch
19
- reader = MARC::XMLReader.new('test/batch.xml')
42
+ @parsers.each do | parser |
43
+ puts "\nRunning test_batch with: #{parser}.\n"
44
+ batch_test(parser)
45
+ end
46
+ end
47
+
48
+ def batch_test(parser)
49
+ reader = MARC::XMLReader.new('test/batch.xml', :parser=>parser)
20
50
  count = 0
21
51
  for record in reader
22
52
  count += 1
@@ -24,15 +54,29 @@ class XMLTest < Test::Unit::TestCase
24
54
  end
25
55
  assert_equal(count, 2)
26
56
  end
27
-
57
+
28
58
  def test_read_string
59
+ @parsers.each do | parser |
60
+ puts "\nRunning test_read_string with: #{parser}.\n"
61
+ read_string_test(parser)
62
+ end
63
+ end
64
+
65
+ def read_string_test(parser)
29
66
  xml = File.new('test/batch.xml').read
30
- reader = MARC::XMLReader.new(StringIO.new(xml))
67
+ reader = MARC::XMLReader.new(StringIO.new(xml), :parser=>parser)
31
68
  assert_equal 2, reader.entries.length
32
69
  end
33
70
 
34
71
  def test_non_numeric_fields
35
- reader = MARC::XMLReader.new('test/non-numeric.xml')
72
+ @parsers.each do | parser |
73
+ puts "\nRunning test_non_numeric_fields with: #{parser}.\n"
74
+ non_numeric_fields_test(parser)
75
+ end
76
+ end
77
+
78
+ def non_numeric_fields_test(parser)
79
+ reader = MARC::XMLReader.new('test/non-numeric.xml', :parser=>parser)
36
80
  count = 0
37
81
  record = nil
38
82
  reader.each do | rec |
@@ -45,21 +89,42 @@ class XMLTest < Test::Unit::TestCase
45
89
  end
46
90
 
47
91
  def test_read_no_leading_zero_write_leading_zero
48
- reader = MARC::XMLReader.new('test/no-leading-zero.xml')
92
+ @parsers.each do | parser |
93
+ puts "\nRunning test_read_no_leading_zero_write_leading_zero with: #{parser}.\n"
94
+ read_no_leading_zero_write_leading_zero_test(parser)
95
+ end
96
+ end
97
+
98
+ def read_no_leading_zero_write_leading_zero_test(parser)
99
+ reader = MARC::XMLReader.new('test/no-leading-zero.xml', :parser=>parser)
49
100
  record = reader.to_a[0]
50
101
  assert_equal("042 zz $a dc ", record['042'].to_s)
51
102
  end
52
103
 
53
104
  def test_leader_from_xml
54
- reader = MARC::XMLReader.new('test/one.xml')
105
+ @parsers.each do | parser |
106
+ puts "\nRunning test_leader_from_xml with: #{parser}.\n"
107
+ leader_from_xml_test(parser)
108
+ end
109
+ end
110
+
111
+ def leader_from_xml_test(parser)
112
+ reader = MARC::XMLReader.new('test/one.xml', :parser=>parser)
55
113
  record = reader.entries[0]
56
114
  assert_equal ' njm a22 uu 4500', record.leader
57
115
  # serializing as MARC should populate the record length and directory offset
58
116
  record = MARC::Record.new_from_marc(record.to_marc)
59
117
  assert_equal '00734njm a2200217uu 4500', record.leader
60
118
  end
61
-
119
+
62
120
  def test_read_write
121
+ @parsers.each do | parser |
122
+ puts "\nRunning test_read_write with: #{parser}.\n"
123
+ read_write_test(parser)
124
+ end
125
+ end
126
+
127
+ def read_write_test(parser)
63
128
  record1 = MARC::Record.new
64
129
  record1.leader = '00925njm 22002777a 4500'
65
130
  record1.append MARC::ControlField.new('007', 'sdubumennmplu')
@@ -74,7 +139,7 @@ class XMLTest < Test::Unit::TestCase
74
139
  assert_match /<controlfield tag='007'>sdubumennmplu<\/controlfield>/, xml
75
140
  assert_match /<\?xml-stylesheet type="text\/xsl" href="style.xsl"\?>/, xml
76
141
 
77
- reader = MARC::XMLReader.new('test/test.xml')
142
+ reader = MARC::XMLReader.new('test/test.xml', :parser=>parser)
78
143
  record2 = reader.entries[0]
79
144
  assert_equal(record1, record2)
80
145
 
metadata CHANGED
@@ -1,15 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: marc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
+ - Kevin Clarke
8
+ - William Groppe
9
+ - Ross Singer
7
10
  - Ed Summers
8
11
  autorequire: marc
9
12
  bindir: bin
10
13
  cert_chain: []
11
14
 
12
- date: 2008-12-30 00:00:00 -05:00
15
+ date: 2009-09-23 00:00:00 -04:00
13
16
  default_executable:
14
17
  dependencies: []
15
18
 
@@ -22,41 +25,45 @@ extensions: []
22
25
  extra_rdoc_files: []
23
26
 
24
27
  files:
25
- - lib/marc.rb
26
- - lib/marc
27
- - lib/marc/xmlwriter.rb
28
+ - lib/marc/constants.rb
28
29
  - lib/marc/controlfield.rb
29
- - lib/marc/xmlreader.rb
30
+ - lib/marc/datafield.rb
31
+ - lib/marc/dublincore.rb
32
+ - lib/marc/exception.rb
30
33
  - lib/marc/reader.rb
31
34
  - lib/marc/record.rb
32
- - lib/marc/exception.rb
33
- - lib/marc/datafield.rb
34
35
  - lib/marc/subfield.rb
35
- - lib/marc/constants.rb
36
- - lib/marc/dublincore.rb
37
36
  - lib/marc/writer.rb
37
+ - lib/marc/xml_parsers.rb
38
+ - lib/marc/xmlreader.rb
39
+ - lib/marc/xmlwriter.rb
40
+ - lib/marc.rb
41
+ - test/batch.dat
42
+ - test/batch.xml
43
+ - test/no-leading-zero.xml
38
44
  - test/non-numeric.dat
39
- - test/tc_dublincore.rb
40
- - test/tc_datafield.rb
41
45
  - test/non-numeric.xml
42
- - test/no-leading-zero.xml
43
- - test/ts_marc.rb
44
- - test/tc_writer.rb
45
- - test/batch.xml
46
- - test/tc_xml.rb
47
46
  - test/one.dat
48
- - test/tc_record.rb
49
47
  - test/one.xml
50
- - test/batch.dat
51
48
  - test/tc_controlfield.rb
49
+ - test/tc_datafield.rb
50
+ - test/tc_dublincore.rb
51
+ - test/tc_marchash.rb
52
+ - test/tc_parsers.rb
52
53
  - test/tc_reader.rb
54
+ - test/tc_record.rb
53
55
  - test/tc_subfield.rb
56
+ - test/tc_writer.rb
57
+ - test/tc_xml.rb
58
+ - test/ts_marc.rb
54
59
  - Rakefile
55
60
  - README
56
61
  - Changes
57
62
  - LICENSE
58
63
  has_rdoc: true
59
- homepage: http://www.textualize.com/ruby_marc
64
+ homepage: http://marc.rubyforge.org/
65
+ licenses: []
66
+
60
67
  post_install_message:
61
68
  rdoc_options: []
62
69
 
@@ -77,9 +84,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
77
84
  requirements: []
78
85
 
79
86
  rubyforge_project:
80
- rubygems_version: 1.3.1
87
+ rubygems_version: 1.3.5
81
88
  signing_key:
82
- specification_version: 2
89
+ specification_version: 3
83
90
  summary: A ruby library for working with Machine Readable Cataloging
84
91
  test_files:
85
92
  - test/ts_marc.rb