marc 1.0.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +106 -29
  7. data/Gemfile +15 -0
  8. data/README.md +240 -47
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -86
  22. data/lib/marc/reader.rb +119 -121
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -81
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +23 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -32
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +99 -87
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +94 -9
data/lib/marc/record.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  module MARC
2
-
3
2
  # The FieldMap is an Array of DataFields and Controlfields.
4
3
  # It also contains a Hash representation
5
4
  # of the fields for faster lookups (under certain conditions)
@@ -8,7 +7,7 @@ module MARC
8
7
  attr_accessor :clean
9
8
 
10
9
  def initialize
11
- @tags = {}
10
+ @tags = {}
12
11
  @clean = true
13
12
  end
14
13
 
@@ -16,7 +15,7 @@ module MARC
16
15
  # values of the fields Array
17
16
  def reindex
18
17
  @tags = {}
19
- self.each_with_index do |field, i|
18
+ each_with_index do |field, i|
20
19
  @tags[field.tag] ||= []
21
20
  @tags[field.tag] << i
22
21
  end
@@ -45,7 +44,7 @@ module MARC
45
44
  indices.compact!
46
45
  return [] if indices.empty?
47
46
 
48
- # Sort it, so we get the fields back in the order they appear in the record
47
+ # Sort it, so we get the fields back in the order they appear in the record
49
48
  indices.sort!
50
49
 
51
50
  indices.each do |tag|
@@ -53,13 +52,11 @@ module MARC
53
52
  end
54
53
  end
55
54
 
56
-
57
-
58
55
  # Freeze for immutability, first reindexing if needed.
59
56
  # A frozen FieldMap is safe for concurrent access, and also
60
57
  # can more easily avoid accidental reindexing on even read-only use.
61
58
  def freeze
62
- self.reindex unless @clean
59
+ reindex unless @clean
63
60
  super
64
61
  end
65
62
  end
@@ -110,19 +107,29 @@ module MARC
110
107
  include Enumerable
111
108
 
112
109
  # the record fields
113
- #attr_reader :fields
110
+ # attr_reader :fields
114
111
 
115
112
  # the record leader
116
113
  attr_accessor :leader
117
114
 
118
115
  def initialize
119
- @fields = FieldMap.new
116
+ @fields = FieldMap.new
120
117
  # leader is 24 bytes
121
- @leader = ' ' * 24
118
+ @leader = " " * 24
122
119
  # leader defaults:
123
120
  # http://www.loc.gov/marc/bibliographic/ecbdldrd.html
124
- @leader[10..11] = '22'
125
- @leader[20..23] = '4500'
121
+ @leader[10..11] = "22"
122
+ @leader[20..23] = "4500"
123
+ end
124
+
125
+ # Returns true if there are no error messages associated with the record
126
+ def valid?
127
+ errors.none?
128
+ end
129
+
130
+ # Returns an array of validation errors for all fields in the record
131
+ def errors
132
+ @fields.flat_map(&:errors)
126
133
  end
127
134
 
128
135
  # add a field to the record
@@ -152,7 +159,7 @@ module MARC
152
159
  # subjects = record.find_all {|f| ('600'..'699') === f.tag}
153
160
 
154
161
  def each
155
- for field in @fields
162
+ @fields.each do |field|
156
163
  yield field
157
164
  end
158
165
  end
@@ -167,14 +174,14 @@ module MARC
167
174
  # title = record['245']
168
175
 
169
176
  def [](tag)
170
- return self.find { |f| f.tag == tag }
177
+ find { |f| f.tag == tag }
171
178
  end
172
179
 
173
180
  # Provides a backwards compatible means to access the FieldMap.
174
181
  # No argument returns the FieldMap array in entirety. Providing
175
182
  # a string, array or range of tags will return an array of fields
176
183
  # in the order they appear in the record.
177
- def fields(filter=nil)
184
+ def fields(filter = nil)
178
185
  unless filter
179
186
  # Since we're returning the FieldMap object, which the caller
180
187
  # may mutate, we precautionarily mark dirty -- unless it's frozen
@@ -198,7 +205,7 @@ module MARC
198
205
 
199
206
  # Returns an array of all of the tags that appear in the record (not necessarily in the order they appear).
200
207
  def tags
201
- return @fields.tag_list
208
+ @fields.tag_list
202
209
  end
203
210
 
204
211
  # Factory method for creating a MARC::Record from MARC21 in
@@ -213,18 +220,17 @@ module MARC
213
220
  #
214
221
  # record = MARC::Record.new_from_marc(marc21, :forgiving => true)
215
222
 
216
- def self.new_from_marc(raw, params={})
217
- return MARC::Reader.decode(raw, params)
223
+ def self.new_from_marc(raw, params = {})
224
+ MARC::Reader.decode(raw, params)
218
225
  end
219
226
 
220
-
221
227
  # Returns a record in MARC21 transmission format (ANSI Z39.2).
222
228
  # Really this is just a wrapper around MARC::Writer::encode
223
229
  #
224
230
  # marc = record.to_marc()
225
231
 
226
232
  def to_marc
227
- return MARC::Writer.encode(self)
233
+ MARC::Writer.encode(self)
228
234
  end
229
235
 
230
236
  # Handy method for returning the MARCXML serialization for a
@@ -232,9 +238,21 @@ module MARC
232
238
  # Really this is just a wrapper around MARC::XMLWriter::encode
233
239
  #
234
240
  # xml_doc = record.to_xml()
241
+ def to_xml(include_namespace: true)
242
+ MARC::XMLWriter.encode(self, include_namespace: include_namespace)
243
+ end
235
244
 
236
- def to_xml
237
- return MARC::XMLWriter.encode(self, :include_namespace => true)
245
+ # Create the actual XML string (as opposed to #to_xml which, for historic reasons,
246
+ # returns an REXML document)
247
+ # @param [Boolean] fast_but_unsafe Use the fast MARC::UnsafeXMLWriter code
248
+ # @param [Boolean] include_namespace Include namespaces on the <record> tag?
249
+ # @return [String] MARC-XML encoding of the record
250
+ def to_xml_string(fast_but_unsafe: false, include_namespace: true)
251
+ if fast_but_unsafe
252
+ MARC::UnsafeXMLWriter.encode(self, include_namespace: include_namespace)
253
+ else
254
+ MARC::XMLWriter.encode(self, include_namespace: include_namespace).to_s
255
+ end
238
256
  end
239
257
 
240
258
  # Handy method for returning a hash mapping this record's values
@@ -244,95 +262,87 @@ module MARC
244
262
  # print dc['title']
245
263
 
246
264
  def to_dublin_core
247
- return MARC::DublinCore.map(self)
265
+ MARC::DublinCore.map(self)
248
266
  end
249
267
 
250
268
  # Return a marc-hash version of the record
251
269
  def to_marchash
252
- return {
253
- 'type' => 'marc-hash',
254
- 'version' => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION],
255
- 'leader' => self.leader,
256
- 'fields' => self.map { |f| f.to_marchash }
257
- }
270
+ {"type" => "marc-hash", "version" => [MARCHASH_MAJOR_VERSION, MARCHASH_MINOR_VERSION], "leader" => leader, "fields" => map { |f| f.to_marchash }}
258
271
  end
259
272
 
260
- #to_hash
261
-
262
273
  # Factory method for creating a new MARC::Record from
263
274
  # a marchash object
264
275
  #
265
276
  # record = MARC::Record.new_from_marchash(mh)
266
277
 
267
278
  def self.new_from_marchash(mh)
268
- r = self.new()
269
- r.leader = mh['leader']
270
- mh['fields'].each do |f|
271
- if (f.length == 2)
279
+ r = new
280
+ r.leader = mh["leader"]
281
+ mh["fields"].each do |f|
282
+ if f.length == 2
272
283
  r << MARC::ControlField.new(f[0], f[1])
273
284
  elsif r << MARC::DataField.new(f[0], f[1], f[2], *f[3])
274
285
  end
275
286
  end
276
- return r
287
+ r
277
288
  end
278
289
 
279
-
280
290
  # Returns a (roundtrippable) hash representation for MARC-in-JSON
281
291
  def to_hash
282
- record_hash = {'leader' => @leader, 'fields' => []}
292
+ record_hash = {"leader" => @leader, "fields" => []}
283
293
  @fields.each do |field|
284
- record_hash['fields'] << field.to_hash
294
+ record_hash["fields"] << field.to_hash
285
295
  end
286
296
  record_hash
287
297
  end
288
298
 
299
+ # Return an actual json-encoded string.
300
+ def to_json_string
301
+ MARC::JSONLWriter.encode(self)
302
+ end
303
+
289
304
  def self.new_from_hash(h)
290
- r = self.new
291
- r.leader = h['leader']
292
- if h['fields']
293
- h['fields'].each do |position|
294
- position.each_pair do |tag, field|
295
- if field.is_a?(Hash)
296
- f = MARC::DataField.new(tag, field['ind1'], field['ind2'])
297
- field['subfields'].each do |pos|
298
- pos.each_pair do |code, value|
299
- f.append MARC::Subfield.new(code, value)
300
- end
305
+ r = new
306
+ r.leader = h["leader"]
307
+ h["fields"]&.each do |position|
308
+ position.each_pair do |tag, field|
309
+ if field.is_a?(Hash)
310
+ f = MARC::DataField.new(tag, field["ind1"], field["ind2"])
311
+ field["subfields"].each do |pos|
312
+ pos.each_pair do |code, value|
313
+ f.append MARC::Subfield.new(code, value)
301
314
  end
302
- r << f
303
- else
304
- r << MARC::ControlField.new(tag, field)
305
315
  end
316
+ r << f
317
+ else
318
+ r << MARC::ControlField.new(tag, field)
306
319
  end
307
320
  end
308
321
  end
309
- return r
322
+ r
310
323
  end
311
324
 
312
325
  # Returns a string version of the record, suitable for printing
313
326
 
314
327
  def to_s
315
328
  str = "LEADER #{leader}\n"
316
- self.each do |field|
317
- str += field.to_s() + "\n"
329
+ each do |field|
330
+ str += field.to_s + "\n"
318
331
  end
319
- return str
332
+ str
320
333
  end
321
334
 
322
-
323
335
  # For testing if two records can be considered equal.
324
336
 
325
337
  def ==(other)
326
- return self.to_s == other.to_s
338
+ to_s == other.to_s
327
339
  end
328
340
 
329
-
330
341
  # Handy for using a record in a regex:
331
342
  # if record =~ /Gravity's Rainbow/ then print "Slothrop" end
332
343
 
333
344
  def =~(regex)
334
- return self.to_s =~ regex
345
+ to_s =~ regex
335
346
  end
336
-
337
347
  end
338
348
  end
data/lib/marc/subfield.rb CHANGED
@@ -1,31 +1,33 @@
1
1
  module MARC
2
-
3
- # A class that represents an individual subfield within a DataField.
4
- # Accessor attributes include: code (letter subfield code) and value
5
- # (the content of the subfield). Both can be empty string, but should
6
- # not be set to nil.
2
+ # A class that represents an individual subfield within a DataField.
3
+ # Accessor attributes include: code (letter subfield code) and value
4
+ # (the content of the subfield). Both can be empty string, but should
5
+ # not be set to nil.
7
6
 
8
7
  class Subfield
9
8
  attr_accessor :code, :value
10
9
 
11
- def initialize(code='' ,value='')
10
+ def initialize(code = "", value = "")
12
11
  # can't allow code or value to be nil
13
12
  # or else it'll screw us up later on
14
- @code = code == nil ? '' : code
15
- @value = value == nil ? '' : value
13
+ @code = code.nil? ? "" : code
14
+ @value = value.nil? ? "" : value
16
15
  end
17
16
 
18
17
  def ==(other)
18
+ if !other.is_a?(Subfield)
19
+ return false
20
+ end
19
21
  if @code != other.code
20
22
  return false
21
23
  elsif @value != other.value
22
24
  return false
23
25
  end
24
- return true
26
+ true
25
27
  end
26
28
 
27
29
  def to_s
28
- return "$#{code} #{value} "
30
+ "$#{code} #{value} "
29
31
  end
30
32
  end
31
33
  end
@@ -0,0 +1,93 @@
1
+ require "marc/xmlwriter"
2
+
3
+ module MARC
4
+ # UnsafeXMLWriter bypasses real xml handlers like REXML or Nokogiri and just concatenates strings
5
+ # to produce the XML document. This has no guarantees of validity if the MARC record you're encoding
6
+ # isn't valid and won't do things like entity expansion, but it does escape using ruby's
7
+ # String#encode(xml: :text) and it's much, much faster -- 4-5 times faster than using Nokogiri,
8
+ # and 15-20 times faster than the REXML version.
9
+ class UnsafeXMLWriter < MARC::XMLWriter
10
+ XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'
11
+ NS_ATTRS = %(xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/MARC21/slim" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd")
12
+
13
+ NS_COLLECTION = "<collection #{NS_ATTRS}>".freeze
14
+ COLLECTION = "<collection>".freeze
15
+ NS_RECORD = "<record #{NS_ATTRS}>".freeze
16
+ RECORD = "<record>".freeze
17
+
18
+ # Write the record to the target
19
+ # @param [MARC::Record] record
20
+ def write(record)
21
+ @fh.write(self.class.encode(record))
22
+ end
23
+
24
+ class << self
25
+ # Open `collection` tag, w or w/o namespace
26
+ def open_collection(include_namespace: true)
27
+ if include_namespace
28
+ NS_COLLECTION
29
+ else
30
+ COLLECTION
31
+ end
32
+ end
33
+
34
+ def open_record(include_namespace: true)
35
+ if include_namespace
36
+ NS_RECORD
37
+ else
38
+ RECORD
39
+ end
40
+ end
41
+
42
+ # Produce an XML string with a single document in a collection
43
+ # @param [MARC::Record] record
44
+ # @param [Boolean] include_namespace Whether to namespace the resulting XML
45
+ def single_record_document(record, include_namespace: true)
46
+ xml = XML_HEADER.dup
47
+ xml << open_collection(include_namespace: include_namespace)
48
+ xml << encode(record, include_namespace: false)
49
+ xml << "</collection>".freeze
50
+ xml
51
+ end
52
+
53
+ # Take a record and turn it into a valid MARC-XML string. Note that
54
+ # this is an XML _snippet_, without an XML header or <collection>
55
+ # enclosure.
56
+ # @param [MARC::Record] record The record to encode to XML
57
+ # @return [String] The XML snippet of the record in MARC-XML
58
+ def encode(record, include_namespace: true)
59
+ xml = open_record(include_namespace: include_namespace).dup
60
+
61
+ # MARCXML only allows alphanumerics or spaces in the leader
62
+ lead = fix_leader(record.leader)
63
+
64
+ xml << "<leader>" << lead.encode(xml: :text) << "</leader>"
65
+ record.each do |f|
66
+ if f.instance_of?(MARC::DataField)
67
+ xml << open_datafield(f.tag, f.indicator1, f.indicator2)
68
+ f.each do |sf|
69
+ xml << open_subfield(sf.code) << sf.value.encode(xml: :text) << "</subfield>"
70
+ end
71
+ xml << "</datafield>"
72
+ elsif f.instance_of?(MARC::ControlField)
73
+ xml << open_controlfield(f.tag) << f.value.encode(xml: :text) << "</controlfield>"
74
+ end
75
+ end
76
+ xml << "</record>"
77
+ xml.force_encoding("utf-8")
78
+ end
79
+
80
+ def open_datafield(tag, ind1, ind2)
81
+ "<datafield tag=\"#{tag}\" ind1=\"#{ind1}\" ind2=\"#{ind2}\">"
82
+ end
83
+
84
+ def open_subfield(code)
85
+ "<subfield code=\"#{code}\">"
86
+ end
87
+
88
+ def open_controlfield(tag)
89
+ "<controlfield tag=\"#{tag}\">"
90
+ end
91
+ end
92
+ end
93
+ end
data/lib/marc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module MARC
2
- VERSION = "1.0.4"
2
+ VERSION = "1.2.0"
3
3
  end
data/lib/marc/writer.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  module MARC
2
-
3
2
  # A class for writing MARC records as binary MARC (ISO 2709)
4
3
  #
5
4
  # == Too-long records
@@ -29,65 +28,65 @@ module MARC
29
28
  # the constructor which you must pass a file path
30
29
  # or an object that responds to a write message
31
30
 
32
- def initialize(file)
33
- if file.class == String
34
- @fh = File.new(file,"w")
35
- elsif file.respond_to?('write')
31
+ def initialize(file, &blk)
32
+ if file.instance_of?(String)
33
+ @fh = File.new(file, "w")
34
+ elsif file.respond_to?(:write)
36
35
  @fh = file
37
36
  else
38
37
  raise ArgumentError, "must pass in file name or handle"
39
38
  end
40
39
  self.allow_oversized = false
41
- end
42
40
 
41
+ if block_given?
42
+ blk.call(self)
43
+ self.close
44
+ end
45
+ end
43
46
 
44
47
  # write a record to the file or handle
45
48
 
46
49
  def write(record)
47
- @fh.write(MARC::Writer.encode(record, self.allow_oversized))
50
+ @fh.write(MARC::Writer.encode(record, allow_oversized))
48
51
  end
49
52
 
50
-
51
53
  # close underlying filehandle
52
54
 
53
55
  def close
54
56
  @fh.close
55
57
  end
56
58
 
57
-
58
59
  # a static method that accepts a MARC::Record object
59
60
  # and returns the record encoded as MARC21 in transmission format
60
61
  #
61
62
  # Second arg allow_oversized, default false, set to true
62
- # to raise on MARC record that can't fit into ISO 2709.
63
+ # to avoid raising on a MARC record that can't fit into ISO 2709.
63
64
  def self.encode(record, allow_oversized = false)
64
- directory = ''
65
- fields = ''
65
+ directory = ""
66
+ fields = ""
66
67
  offset = 0
67
68
  record.each do |field|
68
-
69
69
  # encode the field
70
- field_data = ''
71
- if field.class == MARC::DataField
70
+ field_data = ""
71
+ if field.instance_of?(MARC::DataField)
72
72
  warn("Warn: Missing indicator") unless field.indicator1 && field.indicator2
73
73
  field_data = (field.indicator1 || " ") + (field.indicator2 || " ")
74
- for s in field.subfields
74
+ field.subfields.each do |s|
75
75
  field_data += SUBFIELD_INDICATOR + s.code + s.value
76
76
  end
77
- elsif field.class == MARC::ControlField
77
+ elsif field.instance_of?(MARC::ControlField)
78
78
  field_data = field.value
79
79
  end
80
80
  field_data += END_OF_FIELD
81
81
 
82
82
  # calculate directory entry for the field
83
83
  field_length = (field_data.respond_to?(:bytesize) ?
84
- field_data.bytesize() :
85
- field_data.length())
84
+ field_data.bytesize :
85
+ field_data.length)
86
86
  directory += sprintf("%03s", field.tag) + format_byte_count(field_length, allow_oversized, 4) + format_byte_count(offset, allow_oversized)
87
87
 
88
-
89
88
  # add field to data for other fields
90
- fields += field_data
89
+ fields += field_data
91
90
 
92
91
  # update offset for next field
93
92
  offset += field_length
@@ -100,19 +99,18 @@ module MARC
100
99
  marc = base + fields + END_OF_RECORD
101
100
 
102
101
  # update leader with the byte offset to the end of the directory
103
- bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length()
102
+ bytesize = base.respond_to?(:bytesize) ? base.bytesize() : base.length
104
103
  marc[12..16] = format_byte_count(bytesize, allow_oversized)
105
-
106
104
 
107
105
  # update the record length
108
- bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length()
109
- marc[0..4] = format_byte_count(bytesize, allow_oversized)
106
+ bytesize = marc.respond_to?(:bytesize) ? marc.bytesize() : marc.length
107
+ marc[0..4] = format_byte_count(bytesize, allow_oversized)
110
108
 
111
109
  # store updated leader in the record that was passed in
112
- record.leader = marc[0..LEADER_LENGTH-1]
110
+ record.leader = marc[0..LEADER_LENGTH - 1]
113
111
 
114
112
  # return encoded marc
115
- return marc
113
+ marc
116
114
  end
117
115
 
118
116
  # Formats numbers for insertion into marc binary slots.
@@ -123,7 +121,7 @@ module MARC
123
121
  #
124
122
  # first arg is number, second is boolean whether to allow oversized,
125
123
  # third is max digits (default 5)
126
- def self.format_byte_count(number, allow_oversized, num_digits=5)
124
+ def self.format_byte_count(number, allow_oversized, num_digits = 5)
127
125
  formatted = sprintf("%0#{num_digits}i", number)
128
126
  if formatted.length > num_digits
129
127
  # uh, oh, we've exceeded our max. Either zero out
@@ -134,8 +132,7 @@ module MARC
134
132
  raise MARC::Exception.new("Can't write MARC record in binary format, as a length/offset value of #{number} is too long for a #{num_digits}-byte slot.")
135
133
  end
136
134
  end
137
- return formatted
135
+ formatted
138
136
  end
139
-
140
137
  end
141
138
  end