marc 1.1.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +116 -30
  7. data/Gemfile +5 -0
  8. data/README.md +239 -46
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -87
  22. data/lib/marc/reader.rb +116 -124
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -82
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +28 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -34
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +101 -94
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +129 -22
data/lib/marc/record.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  module MARC
2
-
3
2
  # The FieldMap is an Array of DataFields and Controlfields.
4
3
  # It also contains a Hash representation
5
4
  # of the fields for faster lookups (under certain conditions)
@@ -8,7 +7,7 @@ module MARC
8
7
  attr_accessor :clean
9
8
 
10
9
  def initialize
11
- @tags = {}
10
+ @tags = {}
12
11
  @clean = true
13
12
  end
14
13
 
@@ -16,7 +15,7 @@ module MARC
16
15
  # values of the fields Array
17
16
  def reindex
18
17
  @tags = {}
19
- self.each_with_index do |field, i|
18
+ each_with_index do |field, i|
20
19
  @tags[field.tag] ||= []
21
20
  @tags[field.tag] << i
22
21
  end
@@ -45,7 +44,7 @@ module MARC
45
44
  indices.compact!
46
45
  return [] if indices.empty?
47
46
 
48
- # Sort it, so we get the fields back in the order they appear in the record
47
+ # Sort it, so we get the fields back in the order they appear in the record
49
48
  indices.sort!
50
49
 
51
50
  indices.each do |tag|
@@ -53,13 +52,11 @@ module MARC
53
52
  end
54
53
  end
55
54
 
56
-
57
-
58
55
  # Freeze for immutability, first reindexing if needed.
59
56
  # A frozen FieldMap is safe for concurrent access, and also
60
57
  # can more easily avoid accidental reindexing, even on read-only use.
61
58
  def freeze
62
- self.reindex unless @clean
59
+ reindex unless @clean
63
60
  super
64
61
  end
65
62
  end
@@ -110,19 +107,29 @@ module MARC
110
107
  include Enumerable
111
108
 
112
109
  # the record fields
113
- #attr_reader :fields
110
+ # attr_reader :fields
114
111
 
115
112
  # the record leader
116
113
  attr_accessor :leader
117
114
 
118
115
  def initialize
119
- @fields = FieldMap.new
116
+ @fields = FieldMap.new
120
117
  # leader is 24 bytes
121
- @leader = ' ' * 24
118
+ @leader = " " * 24
122
119
  # leader defaults:
123
120
  # http://www.loc.gov/marc/bibliographic/ecbdldrd.html
124
- @leader[10..11] = '22'
125
- @leader[20..23] = '4500'
121
+ @leader[10..11] = "22"
122
+ @leader[20..23] = "4500"
123
+ end
124
+
125
+ # Returns true if there are no error messages associated with the record
126
+ def valid?
127
+ errors.none?
128
+ end
129
+
130
+ # Returns an array of validation errors for all fields in the record
131
+ def errors
132
+ @fields.flat_map(&:errors)
126
133
  end
127
134
 
128
135
  # add a field to the record
@@ -152,7 +159,7 @@ module MARC
152
159
  # subjects = record.find_all {|f| ('600'..'699') === f.tag}
153
160
 
154
161
  def each
155
- for field in @fields
162
+ @fields.each do |field|
156
163
  yield field
157
164
  end
158
165
  end
@@ -167,14 +174,14 @@ module MARC
167
174
  # title = record['245']
168
175
 
169
176
  def [](tag)
170
- return self.find { |f| f.tag == tag }
177
+ find { |f| f.tag == tag }
171
178
  end
172
179
 
173
180
  # Provides a backwards compatible means to access the FieldMap.
174
181
  # No argument returns the FieldMap array in entirety. Providing
175
182
  # a string, array or range of tags will return an array of fields
176
183
  # in the order they appear in the record.
177
- def fields(filter=nil)
184
+ def fields(filter = nil)
178
185
  unless filter
179
186
  # Since we're returning the FieldMap object, which the caller
180
187
  # may mutate, we precautionarily mark dirty -- unless it's frozen
@@ -198,7 +205,7 @@ module MARC
198
205
 
199
206
  # Returns an array of all of the tags that appear in the record (not necessarily in the order they appear).
200
207
  def tags
201
- return @fields.tag_list
208
+ @fields.tag_list
202
209
  end
203
210
 
204
211
  # Factory method for creating a MARC::Record from MARC21 in
@@ -213,18 +220,17 @@ module MARC
213
220
  #
214
221
  # record = MARC::Record.new_from_marc(marc21, :forgiving => true)
215
222
 
216
- def self.new_from_marc(raw, params={})
217
- return MARC::Reader.decode(raw, params)
223
+ def self.new_from_marc(raw, params = {})
224
+ MARC::Reader.decode(raw, params)
218
225
  end
219
226
 
220
-
221
227
  # Returns a record in MARC21 transmission format (ANSI Z39.2).
222
228
  # Really this is just a wrapper around MARC::Writer::encode
223
229
  #
224
230
  # marc = record.to_marc()
225
231
 
226
232
  def to_marc
227
- return MARC::Writer.encode(self)
233
+ MARC::Writer.encode(self)
228
234
  end
229
235
 
230
236
  # Handy method for returning the MARCXML serialization for a
@@ -232,9 +238,21 @@ module MARC
232
238
  # Really this is just a wrapper around MARC::XMLWriter::encode
233
239
  #
234
240
  # xml_doc = record.to_xml()
241
+ def to_xml(include_namespace: true)
242
+ MARC::XMLWriter.encode(self, include_namespace: include_namespace)
243
+ end
235
244
 
236
- def to_xml
237
- return MARC::XMLWriter.encode(self, :include_namespace => true)
245
+ # Create the actual XML string (as opposed to #to_xml which, for historic reasons,
246
+ # returns an REXML document)
247
+ # @param [Boolean] fast_but_unsafe Use the fast MARC::UnsafeXMLWriter code
248
+ # @param [Boolean] include_namespace Include namespaces on the <record> tag?
249
+ # @return [String] MARC-XML encoding of the record
250
+ def to_xml_string(fast_but_unsafe: false, include_namespace: true)
251
+ if fast_but_unsafe
252
+ MARC::UnsafeXMLWriter.encode(self, include_namespace: include_namespace)
253
+ else
254
+ MARC::XMLWriter.encode(self, include_namespace: include_namespace).to_s
255
+ end
238
256
  end
239
257
 
240
258
  # Handy method for returning a hash mapping this records values
@@ -244,95 +262,87 @@ module MARC
244
262
  # print dc['title']
245
263
 
246
264
  def to_dublin_core
247
- return MARC::DublinCore.map(self)
265
+ MARC::DublinCore.map(self)
248
266
  end
249
267
 
250
268
  # Return a marc-hash version of the record
251
269
  def to_marchash
252
- return {
253
- 'type' => 'marc-hash',
254
- 'version' => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION],
255
- 'leader' => self.leader,
256
- 'fields' => self.map { |f| f.to_marchash }
257
- }
270
+ {"type" => "marc-hash", "version" => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION], "leader" => leader, "fields" => map { |f| f.to_marchash }}
258
271
  end
259
272
 
260
- #to_hash
261
-
262
273
  # Factory method for creating a new MARC::Record from
263
274
  # a marchash object
264
275
  #
265
276
  # record = MARC::Record.new_from_marchash(mh)
266
277
 
267
278
  def self.new_from_marchash(mh)
268
- r = self.new()
269
- r.leader = mh['leader']
270
- mh['fields'].each do |f|
271
- if (f.length == 2)
279
+ r = new
280
+ r.leader = mh["leader"]
281
+ mh["fields"].each do |f|
282
+ if f.length == 2
272
283
  r << MARC::ControlField.new(f[0], f[1])
273
284
  elsif r << MARC::DataField.new(f[0], f[1], f[2], *f[3])
274
285
  end
275
286
  end
276
- return r
287
+ r
277
288
  end
278
289
 
279
-
280
290
  # Returns a (roundtrippable) hash representation for MARC-in-JSON
281
291
  def to_hash
282
- record_hash = {'leader' => @leader, 'fields' => []}
292
+ record_hash = {"leader" => @leader, "fields" => []}
283
293
  @fields.each do |field|
284
- record_hash['fields'] << field.to_hash
294
+ record_hash["fields"] << field.to_hash
285
295
  end
286
296
  record_hash
287
297
  end
288
298
 
299
+ # Return an actual json-encoded string.
300
+ def to_json_string
301
+ MARC::JSONLWriter.encode(self)
302
+ end
303
+
289
304
  def self.new_from_hash(h)
290
- r = self.new
291
- r.leader = h['leader']
292
- if h['fields']
293
- h['fields'].each do |position|
294
- position.each_pair do |tag, field|
295
- if field.is_a?(Hash)
296
- f = MARC::DataField.new(tag, field['ind1'], field['ind2'])
297
- field['subfields'].each do |pos|
298
- pos.each_pair do |code, value|
299
- f.append MARC::Subfield.new(code, value)
300
- end
305
+ r = new
306
+ r.leader = h["leader"]
307
+ h["fields"]&.each do |position|
308
+ position.each_pair do |tag, field|
309
+ if field.is_a?(Hash)
310
+ f = MARC::DataField.new(tag, field["ind1"], field["ind2"])
311
+ field["subfields"].each do |pos|
312
+ pos.each_pair do |code, value|
313
+ f.append MARC::Subfield.new(code, value)
301
314
  end
302
- r << f
303
- else
304
- r << MARC::ControlField.new(tag, field)
305
315
  end
316
+ r << f
317
+ else
318
+ r << MARC::ControlField.new(tag, field)
306
319
  end
307
320
  end
308
321
  end
309
- return r
322
+ r
310
323
  end
311
324
 
312
325
  # Returns a string version of the record, suitable for printing
313
326
 
314
327
  def to_s
315
328
  str = "LEADER #{leader}\n"
316
- self.each do |field|
317
- str += field.to_s() + "\n"
329
+ each do |field|
330
+ str += field.to_s + "\n"
318
331
  end
319
- return str
332
+ str
320
333
  end
321
334
 
322
-
323
335
  # For testing if two records can be considered equal.
324
336
 
325
337
  def ==(other)
326
- return self.to_s == other.to_s
338
+ to_s == other.to_s
327
339
  end
328
340
 
329
-
330
341
  # Handy for using a record in a regex:
331
342
  # if record =~ /Gravity's Rainbow/ then print "Slothrop" end
332
343
 
333
344
  def =~(regex)
334
- return self.to_s =~ regex
345
+ to_s =~ regex
335
346
  end
336
-
337
347
  end
338
348
  end
data/lib/marc/subfield.rb CHANGED
@@ -1,31 +1,33 @@
1
1
  module MARC
2
-
3
- # A class that represents an individual subfield within a DataField.
4
- # Accessor attributes include: code (letter subfield code) and value
5
- # (the content of the subfield). Both can be empty string, but should
6
- # not be set to nil.
2
+ # A class that represents an individual subfield within a DataField.
3
+ # Accessor attributes include: code (letter subfield code) and value
4
+ # (the content of the subfield). Both can be empty string, but should
5
+ # not be set to nil.
7
6
 
8
7
  class Subfield
9
8
  attr_accessor :code, :value
10
9
 
11
- def initialize(code='' ,value='')
10
+ def initialize(code = "", value = "")
12
11
  # can't allow code or value to be nil
13
12
  # or else it'll screw us up later on
14
- @code = code == nil ? '' : code
15
- @value = value == nil ? '' : value
13
+ @code = code.nil? ? "" : code
14
+ @value = value.nil? ? "" : value
16
15
  end
17
16
 
18
17
  def ==(other)
18
+ if !other.is_a?(Subfield)
19
+ return false
20
+ end
19
21
  if @code != other.code
20
22
  return false
21
23
  elsif @value != other.value
22
24
  return false
23
25
  end
24
- return true
26
+ true
25
27
  end
26
28
 
27
29
  def to_s
28
- return "$#{code} #{value} "
30
+ "$#{code} #{value} "
29
31
  end
30
32
  end
31
33
  end
@@ -0,0 +1,93 @@
1
+ require "marc/xmlwriter"
2
+
3
+ module MARC
4
+ # UnsafeXMLWriter bypasses real xml handlers like REXML or Nokogiri and just concatenates strings
5
+ # to produce the XML document. This has no guarantees of validity if the MARC record you're encoding
6
+ # isn't valid and won't do things like entity expansion, but it does escape using ruby's
7
+ # String#encode(xml: :text) and it's much, much faster -- 4-5 times faster than using Nokogiri,
8
+ # and 15-20 times faster than the REXML version.
9
+ class UnsafeXMLWriter < MARC::XMLWriter
10
+ XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
11
+ NS_ATTRS = %(xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/MARC21/slim" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd")
12
+
13
+ NS_COLLECTION = "<collection #{NS_ATTRS}>".freeze
14
+ COLLECTION = "<collection>".freeze
15
+ NS_RECORD = "<record #{NS_ATTRS}>".freeze
16
+ RECORD = "<record>".freeze
17
+
18
+ # Write the record to the target
19
+ # @param [MARC::Record] record
20
+ def write(record)
21
+ @fh.write(self.class.encode(record))
22
+ end
23
+
24
+ class << self
25
+ # Open `collection` tag, w or w/o namespace
26
+ def open_collection(include_namespace: true)
27
+ if include_namespace
28
+ NS_COLLECTION
29
+ else
30
+ COLLECTION
31
+ end
32
+ end
33
+
34
+ def open_record(include_namespace: true)
35
+ if include_namespace
36
+ NS_RECORD
37
+ else
38
+ RECORD
39
+ end
40
+ end
41
+
42
+ # Produce an XML string with a single document in a collection
43
+ # @param [MARC::Record] record
44
+ # @param [Boolean] include_namespace Whether to namespace the resulting XML
45
+ def single_record_document(record, include_namespace: true)
46
+ xml = XML_HEADER.dup
47
+ xml << open_collection(include_namespace: include_namespace)
48
+ xml << encode(record, include_namespace: false)
49
+ xml << "</collection>".freeze
50
+ xml
51
+ end
52
+
53
+ # Take a record and turn it into a valid MARC-XML string. Note that
54
+ # this is an XML _snippet_, without an XML header or <collection>
55
+ # enclosure.
56
+ # @param [MARC::Record] record The record to encode to XML
57
+ # @return [String] The XML snippet of the record in MARC-XML
58
+ def encode(record, include_namespace: true)
59
+ xml = open_record(include_namespace: include_namespace).dup
60
+
61
+ # MARCXML only allows alphanumerics or spaces in the leader
62
+ lead = fix_leader(record.leader)
63
+
64
+ xml << "<leader>" << lead.encode(xml: :text) << "</leader>"
65
+ record.each do |f|
66
+ if f.instance_of?(MARC::DataField)
67
+ xml << open_datafield(f.tag, f.indicator1, f.indicator2)
68
+ f.each do |sf|
69
+ xml << open_subfield(sf.code) << sf.value.encode(xml: :text) << "</subfield>"
70
+ end
71
+ xml << "</datafield>"
72
+ elsif f.instance_of?(MARC::ControlField)
73
+ xml << open_controlfield(f.tag) << f.value.encode(xml: :text) << "</controlfield>"
74
+ end
75
+ end
76
+ xml << "</record>"
77
+ xml.force_encoding("utf-8")
78
+ end
79
+
80
+ def open_datafield(tag, ind1, ind2)
81
+ "<datafield tag=\"#{tag}\" ind1=\"#{ind1}\" ind2=\"#{ind2}\">"
82
+ end
83
+
84
+ def open_subfield(code)
85
+ "<subfield code=\"#{code}\">"
86
+ end
87
+
88
+ def open_controlfield(tag)
89
+ "<controlfield tag=\"#{tag}\">"
90
+ end
91
+ end
92
+ end
93
+ end
data/lib/marc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module MARC
2
- VERSION = "1.1.1"
2
+ VERSION = "1.3.0"
3
3
  end
data/lib/marc/writer.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  module MARC
2
-
3
2
  # A class for writing MARC records as binary MARC (ISO 2709)
4
3
  #
5
4
  # == Too-long records
@@ -29,65 +28,65 @@ module MARC
29
28
  # the constructor which you must pass a file path
30
29
  # or an object that responds to a write message
31
30
 
32
- def initialize(file)
33
- if file.class == String
34
- @fh = File.new(file,"w")
35
- elsif file.respond_to?('write')
31
+ def initialize(file, &blk)
32
+ if file.instance_of?(String)
33
+ @fh = File.new(file, "w")
34
+ elsif file.respond_to?(:write)
36
35
  @fh = file
37
36
  else
38
37
  raise ArgumentError, "must pass in file name or handle"
39
38
  end
40
39
  self.allow_oversized = false
41
- end
42
40
 
41
+ if block_given?
42
+ blk.call(self)
43
+ self.close
44
+ end
45
+ end
43
46
 
44
47
  # write a record to the file or handle
45
48
 
46
49
  def write(record)
47
- @fh.write(MARC::Writer.encode(record, self.allow_oversized))
50
+ @fh.write(MARC::Writer.encode(record, allow_oversized))
48
51
  end
49
52
 
50
-
51
53
  # close underlying filehandle
52
54
 
53
55
  def close
54
56
  @fh.close
55
57
  end
56
58
 
57
-
58
59
  # a static method that accepts a MARC::Record object
59
60
  # and returns the record encoded as MARC21 in transmission format
60
61
  #
61
62
  # Second arg allow_oversized, default false, set to true
62
- # to raise on MARC record that can't fit into ISO 2709.
63
+ # to raise on MARC record that can't fit into ISO 2709.
63
64
  def self.encode(record, allow_oversized = false)
64
- directory = ''
65
- fields = ''
65
+ directory = ""
66
+ fields = ""
66
67
  offset = 0
67
68
  record.each do |field|
68
-
69
69
  # encode the field
70
- field_data = ''
71
- if field.class == MARC::DataField
70
+ field_data = ""
71
+ if field.instance_of?(MARC::DataField)
72
72
  warn("Warn: Missing indicator") unless field.indicator1 && field.indicator2
73
73
  field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
74
- for s in field.subfields
74
+ field.subfields.each do |s|
75
75
  field_data += SUBFIELD_INDICATOR + s.code + s.value
76
76
  end
77
- elsif field.class == MARC::ControlField
77
+ elsif field.instance_of?(MARC::ControlField)
78
78
  field_data = field.value
79
79
  end
80
80
  field_data += END_OF_FIELD
81
81
 
82
82
  # calculate directory entry for the field
83
83
  field_length = (field_data.respond_to?(:bytesize) ?
84
- field_data.bytesize() :
85
- field_data.length())
84
+ field_data.bytesize :
85
+ field_data.length)
86
86
  directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
87
87
 
88
-
89
88
  # add field to data for other fields
90
- fields += field_data
89
+ fields += field_data
91
90
 
92
91
  # update offset for next field
93
92
  offset += field_length
@@ -100,19 +99,18 @@ module MARC
100
99
  marc = base + fields + END_OF_RECORD
101
100
 
102
101
  # update leader with the byte offset to the end of the directory
103
- bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
102
+ bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length
104
103
  marc[12..16] = format_byte_count(bytesize, allow_oversized)
105
-
106
104
 
107
105
  # update the record length
108
- bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
109
- marc[0..4] = format_byte_count(bytesize, allow_oversized)
106
+ bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length
107
+ marc[0..4] = format_byte_count(bytesize, allow_oversized)
110
108
 
111
109
  # store updated leader in the record that was passed in
112
- record.leader = marc[0..LEADER_LENGTH-1]
110
+ record.leader = marc[0..LEADER_LENGTH - 1]
113
111
 
114
112
  # return encoded marc
115
- return marc
113
+ marc
116
114
  end
117
115
 
118
116
  # Formats numbers for insertion into marc binary slots.
@@ -123,7 +121,7 @@ module MARC
123
121
  #
124
122
  # first arg is number, second is boolean whether to allow oversized,
125
123
  # third is max digits (default 5)
126
- def self.format_byte_count(number, allow_oversized, num_digits=5)
124
+ def self.format_byte_count(number, allow_oversized, num_digits = 5)
127
125
  formatted = sprintf("%0#{num_digits}i", number)
128
126
  if formatted.length > num_digits
129
127
  # uh, oh, we've exceeded our max. Either zero out
@@ -134,8 +132,7 @@ module MARC
134
132
  raise MARC::Exception.new("Can't write MARC record in binary format, as a length/offset value of #{number} is too long for a #{num_digits}-byte slot.")
135
133
  end
136
134
  end
137
- return formatted
135
+ formatted
138
136
  end
139
-
140
137
  end
141
138
  end