marc4j4r 1.4.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +3 -0
- data/.gitignore +4 -0
- data/.yardopts +1 -0
- data/ChangeLog.md +40 -0
- data/Gemfile +16 -0
- data/LICENSE.txt +20 -0
- data/README.md +164 -0
- data/Rakefile +34 -0
- data/jars/jackson-all-1.6.0.jar +0 -0
- data/jars/javamarc.jar +0 -0
- data/jars/marc4j-extra-readers-writers.jar +0 -0
- data/lib/marc4j4r.rb +73 -0
- data/lib/marc4j4r/controlfield.rb +34 -0
- data/lib/marc4j4r/datafield.rb +195 -0
- data/lib/marc4j4r/reader.rb +129 -0
- data/lib/marc4j4r/record.rb +257 -0
- data/lib/marc4j4r/version.rb +4 -0
- data/lib/marc4j4r/writer.rb +34 -0
- data/marc4j4r.gemspec +21 -0
- data/spec/alephsequentialreader_spec.rb +111 -0
- data/spec/bad.dat +1 -0
- data/spec/badbatch.dat +1 -0
- data/spec/badbatch.xml +13 -0
- data/spec/batch.dat +1 -0
- data/spec/batch.txt +193 -0
- data/spec/batch.xml +13 -0
- data/spec/chinese_utf8.dat +1 -0
- data/spec/controlfield_spec.rb +42 -0
- data/spec/datafield_spec.rb +115 -0
- data/spec/errors.seq +118 -0
- data/spec/helper.rb +21 -0
- data/spec/one.dat +1 -0
- data/spec/one.txt +17 -0
- data/spec/one.xml +4 -0
- data/spec/reader_spec.rb +112 -0
- data/spec/record_spec.rb +146 -0
- data/spec/test_marc4j4r.rb +12 -0
- data/spec/three.seq +118 -0
- metadata +116 -0
@@ -0,0 +1,129 @@
|
|
1
|
+
import 'org.marc4j.ErrorHandler'
|
2
|
+
require 'jlogger'
|
3
|
+
|
4
|
+
# Mixins that are applied to individual marc4j reader classes.
module MarcReader
  # A replacement #nextRecord for readers that expose an error handler via
  # #errors: after each record is read, any accumulated errors are forwarded
  # to the reader's #logErrors.
  module LoggingNextRecord
    # Read the next record, report reader errors, and optionally index it.
    #
    # @param [Boolean] hashify Whether to call #hashify on the record before
    #   returning it
    # @return The record returned by #next
    # @raise [org.marc4j.MarcException] Re-raised after the message has been
    #   echoed to stdout
    def nextRecord(hashify = true)
      begin
        r = self.next
      # rescue Java::org.marc4j.MarcException => e
      rescue org.marc4j.MarcException => e
        puts "#{e}"
        raise e
      end
      # Object#methods returns Symbols on Ruby 1.9+, so the former
      # `methods.include? 'errors'` test never matched and errors were never
      # logged. respond_to? works on every Ruby version; the extra `errors`
      # check skips readers whose handler is unset.
      logErrors if respond_to?(:errors) && errors
      r.hashify if r && hashify
      r
    end
  end
end
|
20
|
+
|
21
|
+
# Sugar added to the marc4j MarcReader interface: Enumerable iteration,
# a logger (presumably supplied by JLogger::Simple as #log -- confirm), and
# error reporting.
module Java::OrgMarc4j::MarcReader
  include Enumerable
  include JLogger::Simple

  # Forward every error held by the reader's error handler to the logger,
  # choosing the log level from the error's severity. A FATAL error is logged
  # and then terminates the process.
  #
  # @return [void]
  def logErrors
    errs = errors.getErrors
    return unless errs
    errs.each do |err|
      case err.severity
      when ErrorHandler::MAJOR_ERROR
        log.error err.toString
      when ErrorHandler::ERROR_TYPO, ErrorHandler::MINOR_ERROR
        log.warn err.toString
      when ErrorHandler::INFO
        log.info err.toString
      when ErrorHandler::FATAL
        log.error err.toString
        # A bare Process.exit reports success (status 0); a fatal MARC error
        # must surface as a failure to the calling shell.
        Process.exit(false)
      end
    end
  end

  # Return the next record, after (optionally) calling #hashify on it.
  #
  # @param [Boolean] hashify Whether to call #hashify on the record
  # @return The record returned by #next
  def nextRecord(hashify = true)
    r = self.next
    r.hashify if hashify
    r
  end

  # Yield each remaining record in turn.
  #
  # @param [Boolean] hashify Passed through to #nextRecord for every record
  # @yield [record] Each record read from the underlying reader
  # @return [Enumerator] When no block is given
  def each(hashify = true)
    return enum_for(:each, hashify) unless block_given?
    while hasNext
      yield nextRecord(hashify)
    end
  end
end
|
57
|
+
|
58
|
+
|
59
|
+
module MARC4J4R

  # Factory for marc4j readers. Reader.new never returns a Reader instance;
  # it returns one of the marc4j reader objects, with Enumerable (and, for
  # some types, logging) mixed into its class.
  class Reader

    ENCODINGS = ['UTF-8', 'ISO-8859-1', 'MARC-8']
    ENCODING_ALIASES = {:utf8 => 'UTF-8', :marc8 => 'MARC-8', :iso => 'ISO-8859-1'}

    # Every reader type accepted by Reader.new
    READER_TYPES = [:strictmarc, :permissivemarc, :marcxml, :alephsequential, :json]

    # @attr_reader [File] handle The handle of the File (or IO) object being read from
    # NOTE(review): Reader.new stores the handle on the Reader class itself and
    # returns a marc4j object, so this reader is kept only for compatibility.
    attr_reader :handle

    # Get a marc reader of the appropriate type
    # @param [String, IO, java.io.InputStream] input The IO stream (or filename) from which you want to read
    # @param [:strictmarc, :permissivemarc, :marcxml, :alephsequential, :json] type The type of MARC reader you want.
    # @param [:utf8, :iso, :marc8, 'UTF-8', 'ISO-8859-1', 'MARC-8'] encoding An explicit encoding
    #   (only passed on to the :strictmarc and :permissivemarc readers)
    # @return [MarcReader] A MarcReader object with the syntactic sugar added in this file (e.g, each)
    # @raise [ArgumentError] If the encoding or the reader type is not supported
    #
    # @example Get a strict binary MARC reader for the file 'test.mrc'
    #   reader = MARC4J4R::Reader.new('test.mrc')
    #   reader = MARC4J4R::Reader.new('test.mrc', :strictmarc) # same thing; :strictmarc is the default
    #
    # @example Get a strict binary MARC reader for the file 'test.mrc', force input to be treated as utf-8
    #   reader = MARC4J4R::Reader.new('test.mrc', :strictmarc, :utf8)
    #
    # @example Get a permissive binary MARC reader
    #   reader = MARC4J4R::Reader.new('test.mrc', :permissivemarc)
    #
    # @example Get a reader for an xml file
    #   reader = MARC4J4R::Reader.new('test.xml', :marcxml)
    #
    # @example Get a reader based on an existing IO object
    #   require 'open-uri'
    #   infile = open('http://my.machine.com/test.mrc')
    #   reader = MARC4J4R::Reader.new(infile)
    def self.new(input, type = :strictmarc, encoding = nil)
      if encoding
        encoding = ENCODING_ALIASES.fetch(encoding, encoding)
        unless ENCODINGS.include? encoding
          raise ArgumentError, "Encoding must be in [#{ENCODINGS.map {|x| '"' + x + '"'}.join(', ')}], not \"#{encoding}\""
        end
      end

      # Reject an unknown type before touching the input, so that a bad call
      # does not leave an opened stream behind.
      unless READER_TYPES.include? type
        raise ArgumentError, "Reader type #{type} illegal: must be :strictmarc, :permissivemarc, :marcxml, :alephsequential, or :json"
      end

      @handle = IOConvert.byteinstream(input)
      case type
      when :strictmarc
        Java::org.marc4j.MarcStreamReader.send(:include, Enumerable)
        Java::org.marc4j.MarcStreamReader.new(@handle, encoding)
      when :permissivemarc
        encoding ||= 'BESTGUESS'
        Java::org.marc4j.MarcPermissiveStreamReader.send(:include, Enumerable)
        Java::org.marc4j.MarcPermissiveStreamReader.send(:include, JLogger::Simple)
        Java::org.marc4j.MarcPermissiveStreamReader.send(:include, MarcReader::LoggingNextRecord)
        Java::org.marc4j.MarcPermissiveStreamReader.new(@handle, true, true, encoding)
      when :marcxml
        Java::org.marc4j.MarcXmlReader.send(:include, Enumerable)
        Java::org.marc4j.MarcXmlReader.send(:include, JLogger::Simple)
        Java::org.marc4j.MarcXmlReader.new(@handle)
      when :alephsequential
        Java::org.marc4j.MarcAlephSequentialReader.send(:include, Enumerable)
        Java::org.marc4j.MarcAlephSequentialReader.send(:include, JLogger::Simple)
        Java::org.marc4j.MarcAlephSequentialReader.send(:include, MarcReader::LoggingNextRecord)
        Java::org.marc4j.MarcAlephSequentialReader.new(@handle)
      when :json
        Java::org.marc4j.MarcJsonReader.send(:include, Enumerable)
        Java::org.marc4j.MarcJsonReader.send(:include, JLogger::Simple)
        Java::org.marc4j.MarcJsonReader.new(@handle)
      end
    end
  end
end
|
@@ -0,0 +1,257 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
module MARC4J4R
  # MARC4J4R::Record is the marc4j RecordImpl class itself; it is reopened
  # below to add Ruby-flavoured accessors and serializers.
  Record = Java::org.marc4j.marc.impl::RecordImpl

  class Record
    include Enumerable

    alias_method :<<, :addVariableField
    alias_method :append, :addVariableField
    alias_method :fields, :getVariableFields

    # Show equality: two records are equal when their leaders match and they
    # hold equal fields in the same order.
    #
    # @param other The object to compare against
    # @return [Boolean]
    def ==(other)
      # Anything that cannot supply a leader and a field list is not a record;
      # answer false instead of raising NoMethodError.
      return false unless other.respond_to?(:leader) && other.respond_to?(:to_a)
      # Array#== checks both the lengths and every pair of fields, and does so
      # without writing diagnostics to stdout.
      leader == other.leader && to_a == other.to_a
    end

    # Create a local hash by tag number; makes some stuff faster
    # Called automatically if you use reader.each
    #
    # The index is built once and then reused, so fields added afterwards are
    # not visible through it until #rehash is called.
    def hashify
      return if @hashedtags # don't do it more than once
      @hashedtags = {}
      getVariableFields.each do |f|
        @hashedtags[f.tag] ||= []
        @hashedtags[f.tag].push f
      end
    end

    # Force a re-hash
    def rehash
      @hashedtags = nil
      hashify
    end

    # Create a nice string of the record
    # @return [String] The leader line followed by one line per field
    def to_s
      lines = ['LEADER ' + leader]
      each { |f| lines.push f.to_s }
      lines.join("\n")
    end

    # Get the leader as a string (marc4j would otherwise return Leader object)
    # @return [String]
    def leader
      get_leader.toString
    end

    # Set the leader
    # @param [String] str The new leader
    # @raise [RuntimeError] if leader is illegal
    def leader=(str)
      begin
        set_leader Java::org.marc4j.marc.impl.LeaderImpl.new(str)
      rescue Java::java.lang.StringIndexOutOfBoundsException => e
        raise RuntimeError, "'#{str}' not a legal leader: #{e.message}"
      end
    end

    # Cycle through the fields in the order they appear in the record
    def each(&blk)
      getVariableFields.each(&blk)
    end

    # Get the first field associated with a tag
    # @param [String] tag The tag
    # @return [Field] The first matching field, or nil if none. Note that
    #   to mirror ruby-marc, this returns a single field
    def [](tag)
      # Without an index (see #hashify), ask marc4j directly.
      return getVariableField(tag) unless @hashedtags
      matches = @hashedtags[tag]
      matches && matches[0]
    end

    # Get a (possibly empty) list of fields with the given tag(s)
    #
    # @param [String, Array<String>] tags A string (or Array of strings) with the tags you're interested in
    # @param [Boolean] originalorder Whether or not results should be presented in the original order within the
    #   record or with a two-column sort of (a) Order of the tag in the list of tags sent, (b) order within that tag
    #   in the record
    # @return [Array<Field>] Either an empty list or a list of one or more matched fields will be returned.
    #
    # originalorder == false will use an internal hash and be faster in many cases (see #hashify)
    #
    # @example originalorder == false
    #   # Given a record that looks like
    #   # 010    $a 68027371
    #   # 035    $a (RLIN)MIUG0001728-B
    #   # 035    $a (CaOTULAS)159818044
    #   # 035    $a (OCoLC)ocm00001728
    #
    #   r.find_by_tag(['035', '010']).each {|f| puts f.to_s}
    #   # 035    $a (RLIN)MIUG0001728-B
    #   # 035    $a (CaOTULAS)159818044
    #   # 035    $a (OCoLC)ocm00001728
    #   # 010    $a 68027371
    #
    #   # The results are ordered first by tag as passed in, then by original order within the tag
    #
    # @example Just get all fields for a single tag
    #   ohThirtyFives = r.find_by_tag('035')
    #
    # @example Get a bunch of standard identifiers
    #   standardIDs = r.find_by_tag(['022', '020', '010'])
    #
    # @example originalorder == true
    #   r.find_by_tag(['035', '010'], true).each {|f| puts f.to_s}
    #   # 010    $a 68027371
    #   # 035    $a (RLIN)MIUG0001728-B
    #   # 035    $a (CaOTULAS)159818044
    #   # 035    $a (OCoLC)ocm00001728
    def find_by_tag(tags, originalorder = false)
      hashify # no-op when the index already exists
      return @hashedtags[tags] || [] unless tags.is_a? Array
      if originalorder
        find_all { |f| tags.include? f.tag }
      else
        @hashedtags.values_at(*tags).flatten.compact
      end
    end

    # Return the record as valid MARC-XML
    # @return [String] A MARC-XML representation of the record, including the XML header
    def to_xml
      Java::org.marc4j.MarcXmlWriter.record_to_XML(self)
    end

    # Return the record as binary MARC, written by a MarcPermissiveStreamWriter
    # @param [String] encoding The encoding handed to the writer
    # @return [String]
    # NOTE(review): ByteArrayOutputStream#toString() without an argument decodes
    # with the JVM default charset -- confirm that non-ASCII records survive.
    def to_marc(encoding = 'UTF-8')
      s = Java::java.io.ByteArrayOutputStream.new
      writer = org.marc4j.MarcPermissiveStreamWriter.new(s, encoding)
      writer.write(self)
      s.to_string
    end

    # Export as a MARC-Hash, as described at
    # http://robotlibrarian.billdueber.com/marc-hash-the-saga-continues-now-with-even-less-structure/
    # @return [Hash] A marc-hash representation of the record, suitable for calling .to_json on or whatever
    def to_marchash
      h = {}
      h['type'] = 'marc-hash'
      h['version'] = [1,0]
      h['leader'] = leader

      fields = []
      getVariableFields.each do |f|
        if f.controlField?
          fields << [f.tag, f.value]
        else
          subs = []
          f.each do |subfield|
            subs << [subfield.code, subfield.value]
          end
          # A missing indicator is exported as a single space.
          fields << [f.tag, f.indicator1 || ' ', f.indicator2 || ' ', subs]
        end
      end
      h['fields'] = fields
      h
    end

    # Turn it into a marc-in-json hashmap. Note that this won't really work
    # like a ruby hash; you need to know what you're getting, since stuff
    # like #each won't work.
    #
    # Better to just use to_marc_in_json if you want a json string
    def to_hash
      Java::org.marc4j.MarcInJSON.record_to_hash(self)
    end

    # Turn it into a marc-in-json JSON string using Jackson
    def to_marc_in_json
      Java::org.marc4j.MarcInJSON.record_to_marc_in_json(self)
    end
  end

  # Given a marc record in a string, turn it into an object
  # @param [String] str The record as a MARC binary string
  # @param [Symbol, String, nil] encoding Passed through to MARC4J4R::Reader.new
  # @return [MARC4J4R::Record] The first record encoded in the string
  #
  # Note that the normal way of defining this class (self.from_string)
  # didn't work; I assume it has something to do with the fact that
  # it's actually just aliased to the Java class
  def Record.from_string(str, encoding = nil)
    s = Java::java.io.ByteArrayInputStream.new(str.to_java_bytes)
    MARC4J4R::Reader.new(s, :strictmarc, encoding).first
  end

  # Given a marc-xml record in a string, turn it into an object
  # @param [String] str The record as a MARC-XML string
  # @return [MARC4J4R::Record] The first record encoded in the string
  def Record.from_xml_string(str)
    MARC4J4R::Reader.new(StringIO.new(str), :marcxml).first
  end

  # Build a record from a marc-in-json hash (the counterpart of #to_hash)
  def Record.new_from_hash(hash)
    Java::org.marc4j.MarcInJSON.new_from_hash(hash)
  end

  # Build a record from a marc-in-json JSON string (the counterpart of #to_marc_in_json)
  def Record.new_from_marc_in_json(jsonstring)
    Java::org.marc4j.MarcInJSON.new_from_marc_in_json(jsonstring)
  end
end
|
257
|
+
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module MARC4J4R
  # A thin factory over the marc4j writer classes.
  #
  # Writer.new does not produce a Writer instance: it converts the requested
  # destination with IOConvert.byteoutstream and hands back the marc4j writer
  # that matches the requested type.
  #
  # @author Bill Dueber
  class Writer

    # Build the marc4j writer for the given output and type.
    #
    # @param output The destination, handed to IOConvert.byteoutstream
    # @param [:strictmarc, :marcxml, :json] type The kind of writer wanted
    # @return The marc4j writer; the MARC-XML writer is returned with unicode
    #   normalization switched on
    # @raise [ArgumentError] If type is not one of the supported symbols
    def self.new(output, type = :strictmarc)
      @handle = IOConvert.byteoutstream(output)
      case type
      when :strictmarc
        Java::org.marc4j.MarcStreamWriter.new(@handle)
      when :marcxml
        xml_writer = Java::org.marc4j.MarcXmlWriter.new(@handle)
        xml_writer.setUnicodeNormalization(true)
        xml_writer
      when :json
        Java::org.marc4j.MarcJsonWriter.new(@handle)
      else
        raise ArgumentError, "#{type} must be :strictmarc, :marcxml, or :json"
      end
    end
  end
end
|
data/marc4j4r.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
require File.expand_path('../lib/marc4j4r/version', __FILE__)
|
4
|
+
|
5
|
+
# Gem specification for marc4j4r (java platform only).
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "marc4j4r"
  spec.version  = MARC4J4R::VERSION
  spec.platform = 'java'
  spec.license  = "MIT"

  # Description
  spec.summary     = %q{A minimal jruby wrapper around marc4j (http://marc4j.tigris.com)}
  spec.description = %q{Syntactic sugar and some extra methods to deal with MARC data using a fork of the excellent java library marc4j}

  # Contact
  spec.authors  = ["Bill Dueber"]
  spec.email    = "bill@dueber.com"
  spec.homepage = "https://github.com/billdueber/marc4j4r#readme"

  # Contents: everything tracked by git; executables and tests are picked
  # out of that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']
end
|