traject 1.0.0.beta.7 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ccc1ca6c023f01b9e21148a3ca37d86858df8065
4
- data.tar.gz: 4360b687f3ba8d957d4a14d39bc6cefbd443bcdd
3
+ metadata.gz: 31243d453f43fbc8f8634c2511340d8fb96c606f
4
+ data.tar.gz: a843a583f235920931304fc6baa7ff75ce91dfab
5
5
  SHA512:
6
- metadata.gz: dba976f724960b4048c2db65827780a46a2c64c442b2f48aa077375775f9f3a8901f446f226fd26f8e69cddbb696b3c114403c592b20cdc362bc43161a25f608
7
- data.tar.gz: 7216d83c97aa75698d2c03dec2bcc263decf4cc3d39a62187e9b0b09856fa1cac45e4fcb524d633f8a77acbf474343a69fec9b0324b66f056eb888915396e95b
6
+ metadata.gz: 5a0e6f3c695a5fe497f4cadd14fba7a2a10a3ea1ff40e42a6625949227d1eb3f995dc4d0ecb228f5e98c95f737df1c1e0e8089b59e17fcb0fedca8de5208b535
7
+ data.tar.gz: 3598e91ed10b039c4382d56ddc69939713425c4f059eb27adfddfdc272e25f2c52e8f900a31b94ceda6576d716c2155188dc03d4697156314e71aadf99a92471
data/README.md CHANGED
@@ -6,7 +6,7 @@ Might be used to index MARC data for a Solr-based discovery product like [Blackl
6
6
  Traject might also be generalized to a set of tools for getting structured data from a source, and transforming it to a hash-like object to send to a destination.
7
7
 
8
8
 
9
- **Traject is nearing 1.0, it is robust, feature-rich and being used in production by authors -- feedback invited**
9
+ **Traject is stable, mature software that is already being used in production by its authors.**
10
10
 
11
11
  [![Gem Version](https://badge.fury.io/rb/traject.png)](http://badge.fury.io/rb/traject)
12
12
  [![Build Status](https://travis-ci.org/traject-project/traject.png)](https://travis-ci.org/traject-project/traject)
@@ -89,13 +89,11 @@ settings do
89
89
  # various others...
90
90
  provide "solrj_writer.commit_on_close", "true"
91
91
 
92
- # By default, we use the Traject::Marc4JReader, which
93
- # can read marc8 and ISO8859_1 -- if your records are all in UTF8,
94
- # the pure-ruby MarcReader may be faster...
95
- # provide "reader_class_name", "Traject::MarcReader"
96
- # If you ARE using the Marc4JReader, it defaults to "BESTGUESS"
97
- # as to encoding when reading binary, you may want to tell it instead
98
- provide "marc4j_reader.source_encoding", "MARC8" # or UTF-8 or ISO8859_1
92
+ # By default, we use the Traject::MarcReader
93
+ # One alternative is the Marc4JReader, using Marc4J.
94
+ # provide "reader_class_name", "Traject::Marc4JReader"
95
+ # If we're reading binary MARC, it's best to tell it the encoding.
96
+ provide "marc_source.encoding", "MARC-8" # or 'UTF-8' or 'ISO-8859-1' or whatever.
99
97
  end
100
98
  ~~~
101
99
 
@@ -224,7 +224,7 @@ object you configure yourself however you like:
224
224
  ~~~ruby
225
225
  # inside a traject configuration file
226
226
 
227
- logger = Yell.new do |l|
227
+ self.logger = Yell.new do |l|
228
228
  l.level = 'gte.info' # will only pass :info and above to the adapters
229
229
 
230
230
  l.adapter :datefile, 'production.log', level: 'lte.warn' # anything lower or equal to :warn
@@ -232,6 +232,10 @@ object you configure yourself however you like:
232
232
  end
233
233
  ~~~
234
234
 
235
+ **note** it's important to use `self.logger =`, or due to
236
+ ruby idiosyncrasies you'll just be setting a local variable, not the Indexer's
237
+ logger attribute.
238
+
235
239
  See [yell](https://github.com/rudionrails/yell) docs for more, you can
236
240
  do whatever you can make yell, just write ruby.
237
241
 
@@ -47,8 +47,10 @@ settings are applied first of all. It's recommended you use `provide`.
47
47
  * `log.level`: Log this level and above. Default 'info', set to eg 'debug' to get potentially more logging info,
48
48
  or 'error' to get less. https://github.com/rudionrails/yell/wiki/101-setting-the-log-level
49
49
 
50
- * `log.batch_size`: If set to a number N (or string representation), will output a progress line to DEBUG
51
- log, every N records. (use -d to turn logging to DEBUG to see.)
50
+ * `log.batch_size`: If set to a number N (or string representation), will output a progress line to
51
+ log every N records. (by default as INFO, but see log.batch_size.severity)
52
+
53
+ * `log.batch_size.severity`: If `log.batch_size` is set, what logger severity level to log to. Default "INFO", set to "DEBUG" etc if desired.
52
54
 
53
55
  * `marc_source.type`: default 'binary'. Can also set to 'xml' or (not yet implemented todo) 'json'. Command line shortcut `-t`
54
56
 
@@ -332,7 +332,7 @@ class Traject::Indexer
332
332
  if log_batch_size && (count % log_batch_size == 0)
333
333
  batch_rps = log_batch_size / (Time.now - batch_start_time)
334
334
  overall_rps = count / (Time.now - start_time)
335
- logger.debug "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall"
335
+ logger.send(settings["log.batch_size.severity"].downcase.to_sym, "Traject::Indexer#process, read #{count} records at id:#{id_string(record)}; #{'%.0f' % batch_rps}/s this batch, #{'%.0f' % overall_rps}/s overall")
336
336
  batch_start_time = Time.now
337
337
  end
338
338
 
@@ -62,14 +62,14 @@ class Traject::Indexer
62
62
 
63
63
  def self.defaults
64
64
  @@defaults ||= {
65
- "reader_class_name" => "Traject::Marc4JReader",
65
+ "reader_class_name" => "Traject::MarcReader",
66
66
  "writer_class_name" => "Traject::SolrJWriter",
67
- "marc_source.type" => "binary",
67
+ "marc_source.type" => "binary",
68
68
  "marc4j_reader.permissive" => true,
69
- "marc4j_reader.source_encoding" => "BESTGUESS",
70
69
  "solrj_writer.batch_size" => 200,
71
70
  "solrj_writer.thread_pool" => 1,
72
- "processing_thread_pool" => 3
71
+ "processing_thread_pool" => 3,
72
+ "log.batch_size.severity" => "info"
73
73
  }
74
74
  end
75
75
 
@@ -3,9 +3,8 @@ require 'marc'
3
3
  require 'marc/marc4j'
4
4
 
5
5
  # `Traject::Marc4JReader` uses the marc4j java package to parse the MARC records
6
- # into standard ruby-marc MARC::Record objects. This reader is often faster than
7
- # Traject::MarcReader, especially for XML, and offers support for reading Marc8
8
- # encoded records and transcoding to UTF8.
6
+ # into standard ruby-marc MARC::Record objects. This reader may be faster than
7
+ # Traject::MarcReader, especially for XML.
9
8
  #
10
9
  # Marc4JReader can read MARC ISO 2709 ("binary") or MARCXML. We use the Marc4J MarcPermissiveStreamReader
11
10
  # for reading binary, but sometimes in non-permissive mode, according to settings. We use the Marc4j MarcXmlReader
@@ -24,13 +23,15 @@ require 'marc/marc4j'
24
23
  # value to 'permissive' arg of MarcPermissiveStreamReader constructor.
25
24
  # Only used for 'binary'
26
25
  #
27
- # * marc4j_reader.source_encoding: Only used for 'binary', otherwise always UTF-8.
26
+ # * marc_source.encoding: Only used for 'binary', otherwise always UTF-8.
28
27
  # String of the values MarcPermissiveStreamReader accepts:
29
- # * BESTGUESS (tries to use MARC leader and believe it, I think)
30
- # * ISO8859_1
28
+ # * BESTGUESS (default: not entirely clear what Marc4J does with this)
29
+ # * ISO-8859-1 (also accepted: ISO8859_1)
31
30
  # * UTF-8
32
- # * MARC8
33
- # Default 'BESTGUESS', but marc records in the wild are so wrong here, recommend setting.
31
+ # * MARC-8 (also accepted: MARC8)
32
+ # Default 'BESTGUESS', but HIGHLY recommend setting
33
+ # to avoid some Marc4J unpredictability, Marc4J "BESTGUESS" can be unpredictable
34
+ # in a variety of ways.
34
35
  # (will ALWAYS be transcoded to UTF-8 on the way out. We insist.)
35
36
  #
36
37
  # * marc4j_reader.jar_dir: Path to a directory containing Marc4J jar file to use. All .jar's in dir will
@@ -54,7 +55,7 @@ require 'marc/marc4j'
54
55
  #
55
56
  # # Or instead for binary:
56
57
  # provide "marc4j_reader.permissive", true
57
- # provide "marc4j_reader.source_encoding", "MARC8"
58
+ # provide "marc_source.encoding", "MARC8"
58
59
  # end
59
60
  class Traject::Marc4JReader
60
61
  include Enumerable
@@ -94,6 +95,20 @@ class Traject::Marc4JReader
94
95
  settings["marc_source.type"]
95
96
  end
96
97
 
98
+ def specified_source_encoding
99
+ #settings["marc4j_reader.source_encoding"]
100
+ enc = settings["marc_source.encoding"]
101
+
102
+ # one is standard for ruby and we want to support,
103
+ # the other is used by Marc4J and we have to pass it to Marc4J
104
+ enc = "ISO8859_1" if enc == "ISO-8859-1"
105
+
106
+ # default
107
+ enc = "BESTGUESS" if enc.nil? || enc.empty?
108
+
109
+ return enc
110
+ end
111
+
97
112
  def create_marc_reader!
98
113
  case input_type
99
114
  when "binary"
@@ -101,7 +116,7 @@ class Traject::Marc4JReader
101
116
 
102
117
  # #to_inputstream turns our ruby IO into a Java InputStream
103
118
  # third arg means 'convert to UTF-8, yes'
104
- MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, settings["marc4j_reader.source_encoding"])
119
+ MarcPermissiveStreamReader.new(input_stream.to_inputstream, permissive, true, specified_source_encoding)
105
120
  when "xml"
106
121
  MarcXmlReader.new(input_stream.to_inputstream)
107
122
  else
@@ -4,22 +4,33 @@ require 'traject/ndj_reader'
4
4
  # `Traject::MarcReader` uses pure ruby marc gem to parse MARC records. It
5
5
  # can read MARC ISO 2709 ('binary'), MARC-XML, and Marc-in-json (newline-delimited-json).
6
6
  #
7
- # MarcReader can not currently read binary MARC in the MARC8 encoding, see
8
- # the Traject::Marc4JReader instead.
7
+ # Marc4JReader is an alternative to this class, powered by Marc4J. You may be interested
8
+ # in comparing for performance, under your particular use case.
9
9
  #
10
10
  # By default assumes binary MARC encoding, please set marc_source.type setting
11
- # for XML or json.
11
+ # for XML or json. If binary, please set marc_source.encoding with char encoding.
12
12
  #
13
13
  # ## Settings
14
14
 
15
15
  # * "marc_source.type": serialization type. default 'binary'
16
- # * "binary". standard ISO 2709 "binary" MARC format.
17
- # * "xml", MarcXML
16
+ # * "binary". standard ISO 2709 "binary" MARC format,
17
+ # will use ruby-marc MARC::Reader (Note, if you are using
18
+ # type 'binary', you probably want to also set 'marc_source.encoding')
19
+ # * "xml", MarcXML, will use ruby-marc MARC::XMLReader
18
20
  # * "json" The "marc-in-json" format, encoded as newline-separated
19
21
  # json. (synonym 'ndj'). A simplistic newline-separated json, with no comments
20
22
  # allowed, and no unescaped internal newlines allowed in the json
21
23
  # objects -- we just read line by line, and assume each line is a
22
24
  # marc-in-json. http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/
25
+ # will use Traject::NDJReader which uses MARC::Record.new_from_hash.
26
+ # * "marc_source.encoding": Only used for marc_source.type 'binary', character encoding
27
+ # of the source marc records. Can be any
28
+ # encoding recognized by ruby, OR 'MARC-8'. For 'MARC-8', content will
29
+ # be transcoded (by ruby-marc) to UTF-8 in internal MARC::Record Strings.
30
+ # Default nil, meaning let MARC::Reader use its default, which will
31
+ # probably be Encoding.default_internal, which will probably be UTF-8.
32
+ # Right now Traject::MarcReader is hard-coded to transcode to UTF-8 as
33
+ # an internal encoding.
23
34
  # * "marc_reader.xml_parser": For XML type, which XML parser to tell Marc::Reader
24
35
  # to use. Anything recognized by [Marc::Reader :parser
25
36
  # argument](http://rdoc.info/github/ruby-marc/ruby-marc/MARC/XMLReader).
@@ -62,7 +73,9 @@ class Traject::MarcReader
62
73
  when 'json'
63
74
  Traject::NDJReader.new(self.input_stream, settings)
64
75
  else
65
- MARC::Reader.new(self.input_stream)
76
+ args = { :invalid => :replace }
77
+ args[:external_encoding] = settings["marc_source.encoding"]
78
+ MARC::Reader.new(self.input_stream, args)
66
79
  end
67
80
  end
68
81
  return @internal_reader
@@ -1,3 +1,3 @@
1
1
  module Traject
2
- VERSION = "1.0.0.beta.7"
2
+ VERSION = "1.0.0"
3
3
  end
@@ -20,7 +20,7 @@ describe "Marc4JReader" do
20
20
  first = array.first
21
21
 
22
22
  assert_kind_of MARC::Record, first
23
- assert first['245']['a'].encoding.name, "UTF-8"
23
+ assert_equal first['245']['a'].encoding.name, "UTF-8"
24
24
  end
25
25
 
26
26
  it "can skip a bad subfield code" do
@@ -37,7 +37,7 @@ describe "Marc4JReader" do
37
37
 
38
38
  it "reads Marc binary in Marc8 encoding" do
39
39
  file = File.new(support_file_path("one-marc8.mrc"))
40
- settings = Traject::Indexer::Settings.new("marc4j_reader.source_encoding" => "MARC8")
40
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8")
41
41
  reader = Traject::Marc4JReader.new(file, settings)
42
42
 
43
43
  array = reader.to_a
@@ -84,5 +84,53 @@ describe "Marc4JReader" do
84
84
  assert_kind_of Java::org.marc4j.marc.impl::RecordImpl, record.original_marc4j
85
85
  end
86
86
 
87
+ it "replaces unicode character reference in Marc8 transcode" do
88
+ file = File.new(support_file_path "escaped_character_reference.marc8.marc")
89
+ # due to marc4j idiosyncrasies, this test will NOT pass with default source_encoding
90
+ # of "BESTGUESS", it only works if you explicitly set to MARC8. Doh.
91
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC8") # binary type is default
92
+ record = Traject::Marc4JReader.new(file, settings).to_a.first
93
+
94
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
95
+ end
96
+
97
+ describe "Marc4J Java Permissive Stream Reader" do
98
+ # needed for sanity check when our tests fail to see if Marc4J
99
+ # is not behaving how we think it should.
100
+ it "converts character references" do
101
+ file = File.new(support_file_path "escaped_character_reference.marc8.marc")
102
+ reader = MarcPermissiveStreamReader.new(file.to_inputstream, true, true, "MARC-8")
103
+ record = reader.next
104
+
105
+ field = record.getVariableField("260")
106
+ subfield = field.getSubfield('a'.ord)
107
+ value = subfield.getData
108
+
109
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", value
110
+ end
111
+ end
112
+
113
+ it "replaces bad byte in UTF8 marc" do
114
+ skip "Marc4J needs fixing on it's end" # Marc4J won't do this in 'permissive' mode, gah.
115
+
116
+ # Note this only works because the marc file DOES correctly
117
+ # have leader byte 9 set to 'a' for UTF8, otherwise Marc4J can't do it.
118
+ file = File.new(support_file_path "bad_utf_byte.utf8.marc")
119
+
120
+ settings = Traject::Indexer::Settings.new() # binary UTF8 type is default
121
+ reader = Traject::Marc4JReader.new(file, settings)
122
+
123
+ record = reader.to_a.first
124
+
125
+ value = record['300']['a']
126
+
127
+ assert_equal value.encoding.name, "UTF-8"
128
+ assert value.valid_encoding?, "Has valid encoding"
129
+ assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", record['300']['a']
130
+ end
131
+
132
+
133
+
134
+
87
135
 
88
136
  end
@@ -17,21 +17,74 @@ describe "Traject::MarcReader" do
17
17
  assert_equal 30, array.length
18
18
  end
19
19
 
20
- it "reads Marc binary" do
21
- file = File.new(support_file_path "test_data.utf8.mrc")
22
- settings = Traject::Indexer::Settings.new() # binary type is default
23
- reader = Traject::MarcReader.new(file, settings)
24
20
 
25
- array = reader.to_a
21
+ describe "MARC binary" do
22
+ it "reads" do
23
+ file = File.new(support_file_path "test_data.utf8.mrc")
24
+ settings = Traject::Indexer::Settings.new() # binary type is default
25
+ reader = Traject::MarcReader.new(file, settings)
26
26
 
27
- assert_equal 30, array.length
27
+ array = reader.to_a
28
28
 
29
- first = array.first
29
+ assert_equal 30, array.length
30
30
 
31
- assert_kind_of MARC::Record, first
31
+ first = array.first
32
32
 
33
- assert first['245']['a'].encoding.name, "UTF-8"
34
- assert_equal "Fikr-i Ayāz /", first['245']['a']
33
+ assert_kind_of MARC::Record, first
34
+
35
+ assert first['245']['a'].encoding.name, "UTF-8"
36
+ assert_equal "Fikr-i Ayāz /", first['245']['a']
37
+ end
38
+
39
+ it "reads Marc binary in Marc8 encoding, transcoding to UTF-8" do
40
+ file = File.new(support_file_path("one-marc8.mrc"))
41
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8")
42
+ reader = Traject::MarcReader.new(file, settings)
43
+
44
+ array = reader.to_a
45
+
46
+ assert_length 1, array
47
+
48
+
49
+ assert_kind_of MARC::Record, array.first
50
+ a245a = array.first['245']['a']
51
+
52
+ assert a245a.encoding.name, "UTF-8"
53
+ assert a245a.valid_encoding?
54
+ assert_equal "Por uma outra globalização :", a245a
55
+ end
56
+
57
+ it "replaces unicode character reference in Marc8 transcode" do
58
+ file = File.new(support_file_path("escaped_character_reference.marc8.marc"))
59
+
60
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "MARC-8") # binary type is default
61
+ record = Traject::MarcReader.new(file, settings).to_a.first
62
+
63
+ assert_equal "Rio de Janeiro escaped replacement char: \uFFFD .", record['260']['a']
64
+ end
65
+
66
+ it "raises on unrecognized encoding for binary type" do
67
+ file = File.new(support_file_path "one-marc8.mrc")
68
+ settings = Traject::Indexer::Settings.new("marc_source.encoding" => "ADFADFADF")
69
+ assert_raises(ArgumentError) do
70
+ record = Traject::MarcReader.new(file, settings).to_a.first
71
+ end
72
+ end
73
+
74
+ it "replaces bad byte in UTF8 marc binary" do
75
+ file = File.new(support_file_path "bad_utf_byte.utf8.marc")
76
+
77
+ settings = Traject::Indexer::Settings.new() # binary type is default
78
+ reader = Traject::MarcReader.new(file, settings)
79
+
80
+ record = reader.to_a.first
81
+
82
+ value = record['300']['a']
83
+
84
+ assert_equal value.encoding.name, "UTF-8"
85
+ assert value.valid_encoding?, "Has valid encoding"
86
+ assert_equal "This is a bad byte: '\uFFFD' and another: '\uFFFD'", value
87
+ end
35
88
  end
36
89
 
37
90
  it "reads JSON" do
@@ -52,4 +105,6 @@ describe "Traject::MarcReader" do
52
105
 
53
106
 
54
107
 
108
+
109
+
55
110
  end
@@ -0,0 +1 @@
1
+ 00083 a2200037 4500300004500000 aThis is a bad byte: '�' and another: '�'
@@ -0,0 +1 @@
1
+ 00138cam 2200049Ia 45000010008000002600080000082196384 aRio de Janeiro escaped replacement char: � .bEditora Record,c2000.
@@ -20,7 +20,7 @@ Gem::Specification.new do |spec|
20
20
  spec.extra_rdoc_files = spec.files.grep(%r{^doc/})
21
21
 
22
22
 
23
- spec.add_dependency "marc", ">= 0.7.1"
23
+ spec.add_dependency "marc", ">= 0.8.0"
24
24
  spec.add_dependency "marc-marc4j", ">=0.1.1" # use and convert marc4j
25
25
  spec.add_dependency "hashie", ">= 2.0.5", "< 2.1" # used for Indexer#settings
26
26
  spec.add_dependency "slop", ">= 3.4.5", "< 4.0" # command line parsing
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: traject
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.beta.7
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Rochkind
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-11-08 00:00:00.000000000 Z
12
+ date: 2013-11-25 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: marc
@@ -17,12 +17,12 @@ dependencies:
17
17
  requirements:
18
18
  - - '>='
19
19
  - !ruby/object:Gem::Version
20
- version: 0.7.1
20
+ version: 0.8.0
21
21
  requirement: !ruby/object:Gem::Requirement
22
22
  requirements:
23
23
  - - '>='
24
24
  - !ruby/object:Gem::Version
25
- version: 0.7.1
25
+ version: 0.8.0
26
26
  prerelease: false
27
27
  type: :runtime
28
28
  - !ruby/object:Gem::Dependency
@@ -211,11 +211,13 @@ files:
211
211
  - test/test_support/245_no_ab.marc
212
212
  - test/test_support/880_with_no_6.utf8.marc
213
213
  - test/test_support/bad_subfield_code.marc
214
+ - test/test_support/bad_utf_byte.utf8.marc
214
215
  - test/test_support/date_resort_to_260.marc
215
216
  - test/test_support/date_type_r_missing_date2.marc
216
217
  - test/test_support/date_with_u.marc
217
218
  - test/test_support/demo_config.rb
218
219
  - test/test_support/emptyish_record.marc
220
+ - test/test_support/escaped_character_reference.marc8.marc
219
221
  - test/test_support/george_eliot.marc
220
222
  - test/test_support/hebrew880s.marc
221
223
  - test/test_support/louis_armstrong.marc
@@ -281,12 +283,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
281
283
  version: '0'
282
284
  required_rubygems_version: !ruby/object:Gem::Requirement
283
285
  requirements:
284
- - - '>'
286
+ - - '>='
285
287
  - !ruby/object:Gem::Version
286
- version: 1.3.1
288
+ version: '0'
287
289
  requirements: []
288
290
  rubyforge_project:
289
- rubygems_version: 2.1.9
291
+ rubygems_version: 2.1.11
290
292
  signing_key:
291
293
  specification_version: 4
292
294
  summary: Index MARC to Solr; or generally process source records to hash-like structures
@@ -309,11 +311,13 @@ test_files:
309
311
  - test/test_support/245_no_ab.marc
310
312
  - test/test_support/880_with_no_6.utf8.marc
311
313
  - test/test_support/bad_subfield_code.marc
314
+ - test/test_support/bad_utf_byte.utf8.marc
312
315
  - test/test_support/date_resort_to_260.marc
313
316
  - test/test_support/date_type_r_missing_date2.marc
314
317
  - test/test_support/date_with_u.marc
315
318
  - test/test_support/demo_config.rb
316
319
  - test/test_support/emptyish_record.marc
320
+ - test/test_support/escaped_character_reference.marc8.marc
317
321
  - test/test_support/george_eliot.marc
318
322
  - test/test_support/hebrew880s.marc
319
323
  - test/test_support/louis_armstrong.marc