marc4j4r 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. data/README.rdoc +17 -0
  2. data/Rakefile +26 -13
  3. data/VERSION +1 -1
  4. data/jars/marc4j.jar +0 -0
  5. data/lib/marc4j4r/controlfield.rb +32 -0
  6. data/lib/marc4j4r/datafield.rb +196 -0
  7. data/lib/marc4j4r/reader.rb +71 -0
  8. data/lib/marc4j4r/record.rb +214 -0
  9. data/lib/marc4j4r/writer.rb +29 -0
  10. data/lib/marc4j4r.rb +26 -485
  11. data/lib/original_monolithic_file.rb +518 -0
  12. data/spec/batch.dat +1 -0
  13. data/spec/batch.txt +193 -0
  14. data/spec/batch.xml +13 -0
  15. data/spec/controlfield_spec.rb +40 -0
  16. data/spec/datafield_spec.rb +56 -0
  17. data/spec/one.dat +1 -0
  18. data/spec/one.txt +17 -0
  19. data/spec/one.xml +4 -0
  20. data/spec/reader_spec.rb +49 -0
  21. data/spec/record_spec.rb +101 -0
  22. data/{test/helper.rb → spec/spec_helper.rb} +9 -5
  23. metadata +74 -80
  24. data/.document +0 -5
  25. data/.gitignore +0 -21
  26. data/README.markdown +0 -41
  27. data/doc/ControlFieldImpl.html +0 -314
  28. data/doc/DataFieldImpl.html +0 -875
  29. data/doc/Java/OrgMarc4j/MarcReader.html +0 -184
  30. data/doc/MARC4J4R/Reader.html +0 -245
  31. data/doc/MARC4J4R.html +0 -281
  32. data/doc/RecordImpl.html +0 -686
  33. data/doc/SubfieldImpl.html +0 -252
  34. data/doc/_index.html +0 -153
  35. data/doc/class_list.html +0 -36
  36. data/doc/css/common.css +0 -1
  37. data/doc/css/full_list.css +0 -50
  38. data/doc/css/style.css +0 -268
  39. data/doc/file.README.html +0 -90
  40. data/doc/file_list.html +0 -38
  41. data/doc/frames.html +0 -13
  42. data/doc/index.html +0 -90
  43. data/doc/js/app.js +0 -99
  44. data/doc/js/full_list.js +0 -106
  45. data/doc/js/jquery.js +0 -19
  46. data/doc/method_list.html +0 -219
  47. data/doc/top-level-namespace.html +0 -87
  48. data/jars/MarcImporter.jar +0 -0
  49. data/test/batch.seq +0 -118
  50. data/test/bench.rb +0 -63
  51. data/test/one.dat +0 -1
  52. data/test/one.seq +0 -30
  53. data/test/one.xml +0 -55
  54. data/test/test_marc4j4r.rb +0 -76
data/lib/marc4j4r.rb CHANGED
@@ -9,504 +9,45 @@ rescue NameError => e
9
9
  require "#{jardir}/marc4j.jar"
10
10
  end
11
11
 
12
- require 'set'
12
+ # Define a method that will take a string (filename), IO object, or StringIO object,
13
+ # and return an inputstream/outputstream
13
14
 
14
-
15
- # Re-open the MarcReader interface, define #each and include Enumerable
16
- #
17
- # We also automatically call #hashify on the records that stream through
18
- # #each in order to speed up RecordImpl#[] when (a) doing many operations on a single
19
- # record, and (b) we're not worried about interleaved tags (e.g., a 520 followed by a 510 followed
20
- # by another 520)
21
-
22
- module Java::OrgMarc4j::MarcReader
23
- include Enumerable
24
-
25
- # Return the next record, after calling #hashify on it
26
- def each(hashify=true)
27
- while self.hasNext
28
- r = self.next
29
- r.hashify if hashify
30
- yield r
31
- end
32
- end
33
- end
34
-
35
-
36
- module MARC4J4R
37
-
38
- # Add some sugar to the MarcReader interface
39
- #
40
- # Adjust the interface so that a #new call to any implementations that
41
- # implement it can take a java.io.InputStream, ruby IO obejct, or String
42
- # (that will be interpreted as a filename) without complaining.
43
- #
44
- # The mechanism -- running module_eval on a string-representation of the
45
- # new method in each of the hard-coded implementations of MarcReader
46
- # (MarcStreamReader,MarcPermissiveStreamReader,MarcXmlReader) -- is ugly
47
- # and deeply unsettling.
48
- #
49
- # @author Bill Dueber
50
- #
51
- # A string used to override the initializer for each stream reader
52
- # Need to do it this ugly way because of the way java and ruby interact;
53
- # can't just add it to the MarcReader interface the way I wanted to.
54
-
55
- NEWINIT = <<-ENDBINDER
56
- include Enumerable
57
- alias_method :oldinit, :initialize
58
- def initialize(fromwhere)
59
- stream = nil
60
- if fromwhere.is_a? Java::JavaIO::InputStream or fromwhere.is_a? Java::JavaIO::ByteArrayInputStream
61
- stream = fromwhere
62
- elsif fromwhere.is_a? IO
63
- stream = fromwhere.to_inputstream
64
- else
65
- stream = java.io.FileInputStream.new(fromwhere.to_java_string)
66
- end
67
- if self.class == Java::org.marc4j.MarcPermissiveStreamReader
68
- self.oldinit(stream, true, true)
69
- else
70
- self.oldinit(stream)
71
- end
72
- end
73
- ENDBINDER
74
-
75
- Java::org.marc4j.MarcStreamReader.module_eval(NEWINIT)
76
- Java::org.marc4j.MarcPermissiveStreamReader.module_eval(NEWINIT)
77
- Java::org.marc4j.MarcXmlReader.module_eval(NEWINIT)
78
-
79
-
80
-
81
- # Get a marc reader of the appropriate type
82
- # @param [String, IO, java.io.InputStream] input The IO stream (or filename) from which you want to read
83
- # @param [:strictmarc, :permissivemarc, :marcxml] The type of MARC reader you want.
84
- # @return [MarcReader] A MarcReader object with the syntactic sugar added in this file (e.g, each)
85
- #
86
- # @example Get a strict binary MARC reader for the file 'test.mrc'
87
- # reader = MARC4J4R.reader('test.mrc')
88
- #
89
- # @example Get a permissive binary MARC reader
90
- # reader = MARC4J4R.reader('test.mrc', :permissivemarc)
91
- #
92
- # @example Get a reader for an xml file
93
- # reader = MARC4J4R.reader('test.xml', :marcxml)
94
- #
95
- # @example Get a reader based on an existing IO object
96
- # require 'open-uri'
97
- # infile = open('http://my.machine.com/test.mrc')
98
- # reader = MARC4J4R.reader(infile)
99
-
100
- def reader(input, type = :strictmarc)
101
- case type
102
- when :strictmarc then
103
- return Java::org.marc4j.MarcStreamReader.new(input)
104
- when :permissivemarc then
105
- return Java::org.marc4j.MarcPermissiveStreamReader.new(input)
106
- when :marcxml then
107
- return Java::org.marc4j.MarcXmlReader.new(input)
108
- when :alephsequential then
109
- return MARC4J4R::AlephSequentialReader.new(input)
110
- else
111
- raise ArgumentError, "Reader type #{type} illegal: must be :strictmarc, :permissivemarc, :marcxml, or :alephsequential"
112
- end
113
- end
114
- module_function :reader
115
-
116
-
117
- # Implement an AlephSequential reader
118
- class AlephSequentialReader
119
- include Enumerable
120
- def initialize(fromwhere)
121
- stream = nil
122
- if fromwhere.is_a? Java::JavaIO::InputStream
123
- stream = fromwhere.to_io
124
- elsif fromwhere.is_a? IO
125
- stream = fromwhere
126
- else
127
- stream = File.new(fromwhere)
128
- end
129
-
130
- @handle = stream
131
- end
132
-
133
- def each
134
- record = nil
135
- currentID = nil
136
-
137
- @handle.each_line do |l|
138
- l.chomp!
139
- next unless l =~ /\S/
140
- vals = l.unpack('a9 a a3 c c a3 a*')
141
- id, tag, ind1, ind2, data = vals[0], vals[2], vals[3], vals[4], vals[6]
142
- # id, tag, ind1, ind2, junk, data = *(l.unpack('A10 a3 c c a3 A*'))
143
- if id != currentID
144
- if record
145
- yield record
146
- end
147
- record = RecordImpl.new
148
- currentID = id
149
- end
150
- if tag == 'LDR'
151
- record.setLeader(Java::org.marc4j.marc.impl.LeaderImpl.new(data))
152
- else
153
- record << buildField(tag,ind1,ind2,data)
154
- end
155
- end
156
- yield record
157
- end
158
-
159
-
160
- SUBREGEXP = /\$\$(.)/
161
- def buildField (tag, ind1, ind2, data)
162
- if Java::org.marc4j.marc.impl.Verifier.isControlField tag
163
- return Java::org.marc4j.marc.impl.ControlFieldImpl.new(tag, data)
164
- else
165
- f = Java::org.marc4j.marc.impl.DataFieldImpl.new(tag, ind1, ind2)
166
- data.split(SUBREGEXP)[1..-1].each_slice(2) do |code, value|
167
- f.addSubfield Java::org.marc4j.marc.impl.SubfieldImpl.new(code[0].ord, value)
168
- end
169
- return f
170
- end
171
- end
172
-
173
- end # End of class AlephSequentialReader
174
-
175
- end
176
-
177
-
178
-
179
-
180
- include_class Java::org.marc4j.marc.impl::RecordImpl
181
- include_class Java::org.marc4j.marc.impl::ControlFieldImpl
182
- include_class Java::org.marc4j.marc.impl::DataFieldImpl
183
- include_class Java::org.marc4j.marc.impl::SubfieldImpl
184
-
185
- # Open up RecordImpl to add some sugar, including Enumberable as well
186
- # @author Bill Dueber
187
-
188
- class RecordImpl
189
- include Enumerable
190
-
191
- alias_method :<<, :addVariableField
192
- alias_method :append, :addVariableField
193
- alias_method :fields, :getVariableFields
194
-
195
- # Export as a MARC-Hash, as described at
196
- # http://robotlibrarian.billdueber.com/marc-hash-the-saga-continues-now-with-even-less-structure/
197
- # @return A marc-hash representation of the record, suitable for calling .to_json on or whatever
198
- def to_marchash
199
- h = {}
200
- h['type'] = 'marc-hash'
201
- h['version'] = [1,0]
202
- h['leader'] = self.leader
203
-
204
- fields = []
205
-
206
- self.getVariableFields.each do |f|
207
- if f.controlField?
208
- fields << [f.tag, f.value]
209
- else
210
- farray = [f.tag, f.indicator1 || ' ', f.indicator2 || ' ']
211
- subs = []
212
- f.each do |subfield|
213
- subs << [subfield.code, subfield.value]
214
- end
215
- farray.push subs
216
- fields << farray
217
- end
218
- end
219
- h['fields'] = fields
220
- return h
221
- end
222
-
223
- # Create a local hash by tag number; makes some stuff faster
224
- # Called automatically if you use reader.each
225
-
226
- def hashify
227
- return if @hashedtags # don't do it more than once
228
- @hashedtags = {}
229
- self.getVariableFields.each do |f|
230
- @hashedtags[f.tag] ||= []
231
- @hashedtags[f.tag].push f
232
- end
233
- end
234
-
235
- # Create a nice string of the record
236
- def to_s
237
- arr = ['LEADER ' + self.leader]
238
- self.each do |f|
239
- arr.push f.to_s
240
- end
241
- return arr.join("\n")
242
- end
243
-
244
- # Get the leader as a string (marc4j would otherwise return Leader object)
245
- def leader
246
- self.get_leader.toString
247
- end
248
-
249
-
250
- # Cycle through the fields in the order the appear in the record
251
- def each
252
- self.getVariableFields.each do |f|
253
- yield f
254
- end
255
- end
256
-
257
- # Get the first field associated with a tag
258
- # @param [String] tag The tag
259
- # @return [Field] The first matching field, or nil if none. Note that
260
- # to mirror ruby-marc, this returns a single field
15
+ module IOConvert
261
16
 
262
- def [] tag
263
- if defined? @hashedtags
264
- if @hashedtags[tag]
265
- return @hashedtags[tag][0]
266
- else
267
- return nil
268
- end
269
- else
270
- return self.getVariableField(tag)
17
+ def byteinstream(fromwhere)
18
+ stream = nil
19
+ if fromwhere.is_a? Java::JavaIO::InputStream
20
+ stream = fromwhere
21
+ elsif fromwhere.is_a? String
22
+ stream = java.io.FileInputStream.new(fromwhere.to_java_string)
23
+ elsif fromwhere.respond_to? :to_inputstream
24
+ stream = fromwhere.to_inputstream
271
25
  end
26
+ return stream
272
27
  end
273
-
274
-
275
- # Get a (possibly empty) list of fields with the given tag(s)
276
- #
277
- # @param [String, Array<String>] tags A string (or Array of strings) with the tags you're interested in
278
- # @param [Boolean] originalorder Whether or not results should be presented in the original order within the
279
- # record or with a two-column sort of (a) Order of the tag in the list of tags sent, (b) order within that tag
280
- # in the record
281
- # @return [Array<Field>] Either an empty list or a list of one or more matched fields will be returned.
282
- #
283
- # originalorder == false will use an internal hash and be faster in many cases (see #hashify)
284
- #
285
- # @example originalorder == false
286
- # # Given a record that looks like
287
- # # 010 $a 68027371
288
- # # 035 $a (RLIN)MIUG0001728-B
289
- # # 035 $a (CaOTULAS)159818044
290
- # # 035 $a (OCoLC)ocm00001728
291
- #
292
- # r.find_by_tag(['035', '010']).each {|f| puts f.to_s}
293
- # # 035 $a (RLIN)MIUG0001728-B
294
- # # 035 $a (CaOTULAS)159818044
295
- # # 035 $a (OCoLC)ocm00001728
296
- # # 010 $a 68027371
297
- #
298
- # # The results are ordered first by tag as passed in, then by original order within the tag
299
- #
300
- # @example Just get all fields for a single tag
301
- # ohThirtyFives = r.find_by_tag('035')
302
- #
303
- # @example Get a bunch of standard identifiers
304
- # standardIDs = r.find_by_tag(['022', '020', '010'])
305
- #
306
- # @example originalorder == true
307
- # r.find_by_tag(['035', '010'], true).each {|f| puts f.to_s}
308
- # # 010 $a 68027371
309
- # # 035 $a (RLIN)MIUG0001728-B
310
- # # 035 $a (CaOTULAS)159818044
311
- # # 035 $a (OCoLC)ocm00001728
312
-
313
- def find_by_tag(tags, originalorder = false)
314
- self.hashify unless @hashedtags and !originalorder
315
- if !tags.is_a? Array
316
- return @hashedtags[tags] || []
317
- end
318
- if originalorder
319
- return self.find_all {|f| tags.include? f.tag}
320
- else
321
- # puts "Tags is #{tags}: got #{@hashedtags.values_at(*tags)}"
322
- return @hashedtags.values_at(*tags).flatten.compact
323
- end
324
- end
325
-
326
28
 
327
-
328
- # Return the record as valid MARC-XML
329
- # @return String A MARC-XML representation of the record, including the XML header
330
- def to_xml
331
- return @xml if @xml
332
- begin
333
- @xml = java.io.StringWriter.new
334
- res = javax.xml.transform.stream.StreamResult.new(@xml)
335
- writer = org.marc4j.MarcXmlWriter.new(res)
336
- writer.write(self)
337
- writer.writeEndDocument
338
- return @xml.toString
339
- rescue
340
- "Woops! to_xml failed for record #{self['001'].data}: #{$!}"
341
- end
342
- end
343
-
344
- def to_marc
345
- begin
346
- s = Java::java.io.ByteArrayOutputStream.new
347
- writer = org.marc4j.MarcStreamWriter.new(s)
348
- writer.write(self)
349
- @marcbinary = s.to_string
350
- puts @marcbinary
351
- return @marcbinary
352
- rescue
353
- # "Woops! to_marc failed for record #{self['001'].data}: #{$!}"
354
- "Whoops! Failed: #{$!}"
29
+ def byteoutstream towhere
30
+ stream = nil
31
+ if towhere.is_a? Java::JavaIO::OutputStream
32
+ stream = towhere
33
+ elsif towhere.is_a? String
34
+ stream = java.io.FileOutputStream.new(towhere.to_java_string)
35
+ elsif towhere.respond_to? :to_outputstream
36
+ stream = towhere.to_outputstream
355
37
  end
356
- end
357
-
358
-
359
- end
360
-
361
- class ControlFieldImpl
362
- def value
363
- return self.data
38
+ return stream
364
39
  end
365
40
 
366
- def controlField?
367
- return true
368
- end
369
41
 
370
- def self.control_tag? tag
371
- return Java::org.marc4j.marc.impl.Verifier.isControlField tag
372
- end
42
+ module_function :byteinstream, :byteoutstream
373
43
 
374
- # Pretty-print
375
- # @param [String] joiner What string to use to join the subfields
376
- # @param [String] The pretty string
377
- def to_s
378
- return self.tag + " " + self.value
379
- end
380
-
381
44
  end
382
-
383
- class DataFieldImpl
384
- include Enumerable
385
45
 
386
- alias_method :<<, :addSubfield
387
46
 
388
47
 
389
- def controlField?
390
- return false
391
- end
392
48
 
393
- # Broken. Need to check subs as well
394
- def == other
395
- self.tag == other.tag and
396
- self.indicator1 == other.indicator1 and
397
- self.indicator2 == other.indicator2
398
- end
399
-
400
- # Pretty-print
401
- # @param [String] joiner What string to use to join the subfields
402
- # @param [String] The pretty string
403
- def to_s (joiner = ' ')
404
- arr = [self.tag + ' ' + self.indicator1 + self.indicator2]
405
- self.each do |s|
406
- arr.push s.to_s
407
- end
408
- return arr.join(joiner)
409
- end
410
-
411
- # Get the value of the first subfield of this field with the given code
412
- # @param [String] code 1-character string of the subfield code
413
- # @return [String] The value of the first matched subfield
414
- def [] code
415
- raise ArgumentError, "Code must be a one-character string, not #{code}" unless code.is_a? String and code.size == 1
416
- # need to send a char value that the underlying java can deal with
417
- sub = self.getSubfield(code[0].ord)
418
- if (sub)
419
- return sub.getData
420
- else
421
- return nil
422
- end
423
- end
424
-
425
49
 
426
- # Get all values from the subfields for the given code or array of codes
427
- # @param [String, Array<String>] code (Array of?) 1-character string(s) of the subfield code
428
- # @param [Boolean] myorder Use the order of subfields that I gave instead of the order they're in the record
429
- # @return [Array<String>] A possibly-empty array of Strings made up of the values in the subfields whose
430
- # code is included in the given codes. If myorder == true, use the order in which they are passed in; if a code is repeated
431
- # (ocassionally legal) subfield values will appear first ordered by the passed array, then by order within
432
- # the document.
433
- #
434
- # If myorder is false, just return the values for matching subfields in the order they appear in the field.
435
- #
436
- # @example Quick examples:
437
- # # 260 $a New York, $b Van Nostrand Reinhold Co. $c 1969
438
- # rec['260'].sub_values('a') #=> ["New York,"]
439
- # rec['260'].sub_values(['a', 'c']) #=> ["New York,", "1969"]
440
- # rec['260'].sub_values(['c', 'a']) #=> ["New York,", "1969"]
441
- # rec['260'].sub_values(['c', 'a'], true) #=> ["1969", "New York"]
442
-
443
- def sub_values(code, myorder = false)
444
-
445
- # Do a little razzle-dazzle for the common case when a single code is given
446
- if not [Set, Array].include? code.class
447
- c = code
448
- elsif code.size == 1
449
- c = code.first
450
- end
451
- if c
452
- return self.find_all { |s| c == s.code}.map {|s| s.data}
453
- end
454
-
455
- # unless [Set, Array].include? code.class
456
- # code = [code]
457
- # # puts "Arrayified for code #{code} / #{code.class}"
458
- # end
459
- if myorder
460
- subs = []
461
- code.each do |c|
462
- subs << self.find_all {|s| c == s.code}
463
- end
464
- return subs.flatten.map {|s| s.data}
465
- else
466
- return self.find_all{|s| code.include? s.code}.map {|s| s.data}
467
- end
468
- end
469
-
470
- # Get first indicator as a one-character string
471
- def indicator1
472
- return self.getIndicator1.chr
473
- end
474
-
475
- # Get second indicator as a one-character string
476
- def indicator2
477
- return self.getIndicator2.chr
478
- end
479
-
480
- # Iterate over the subfields
481
- def each
482
- self.getSubfields.each do |s|
483
- yield s
484
- end
485
- end
486
-
487
- # Get the concatentated values of the subfields in order the appear in the field
488
- # @param [String] joiner The string used to join the subfield values
489
- def value joiner=' '
490
- data = self.getSubfields.map {|s| s.data}
491
- return data.join(joiner)
492
- end
493
- end
494
-
495
- class SubfieldImpl
496
-
497
- def == other
498
- return ((self.code == other.code) and (self.data == other.data))
499
- end
500
-
501
- def value
502
- return self.data
503
- end
504
-
505
- def code
506
- return self.getCode.chr
507
- end
508
-
509
- def to_s
510
- return '$' + self.code + " " + self.data
511
- end
512
- end
50
+ require 'marc4j4r/record.rb'
51
+ require 'marc4j4r/controlfield.rb'
52
+ require 'marc4j4r/reader.rb'
53
+ require 'marc4j4r/datafield.rb'