marc 1.0.4 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
  3. data/.github/workflows/ruby.yml +24 -0
  4. data/.gitignore +17 -0
  5. data/.standard.yml +1 -0
  6. data/{Changes → CHANGELOG.md} +106 -29
  7. data/Gemfile +15 -0
  8. data/README.md +240 -47
  9. data/Rakefile +14 -14
  10. data/bin/marc +14 -0
  11. data/bin/marc2xml +17 -0
  12. data/examples/xml2marc.rb +10 -0
  13. data/lib/marc/constants.rb +3 -3
  14. data/lib/marc/controlfield.rb +35 -23
  15. data/lib/marc/datafield.rb +70 -63
  16. data/lib/marc/dublincore.rb +59 -41
  17. data/lib/marc/exception.rb +9 -1
  18. data/lib/marc/jsonl_reader.rb +33 -0
  19. data/lib/marc/jsonl_writer.rb +44 -0
  20. data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
  21. data/lib/marc/marc8/to_unicode.rb +80 -86
  22. data/lib/marc/reader.rb +119 -121
  23. data/lib/marc/record.rb +72 -62
  24. data/lib/marc/subfield.rb +12 -10
  25. data/lib/marc/unsafe_xmlwriter.rb +93 -0
  26. data/lib/marc/version.rb +1 -1
  27. data/lib/marc/writer.rb +27 -30
  28. data/lib/marc/xml_parsers.rb +222 -197
  29. data/lib/marc/xmlreader.rb +131 -114
  30. data/lib/marc/xmlwriter.rb +93 -81
  31. data/lib/marc.rb +20 -18
  32. data/marc.gemspec +23 -0
  33. data/test/marc8/tc_marc8_mapping.rb +3 -3
  34. data/test/marc8/tc_to_unicode.rb +28 -32
  35. data/test/messed_up_leader.xml +9 -0
  36. data/test/tc_controlfield.rb +37 -34
  37. data/test/tc_datafield.rb +65 -60
  38. data/test/tc_dublincore.rb +9 -11
  39. data/test/tc_hash.rb +10 -13
  40. data/test/tc_jsonl.rb +19 -0
  41. data/test/tc_marchash.rb +17 -21
  42. data/test/tc_parsers.rb +108 -144
  43. data/test/tc_reader.rb +35 -36
  44. data/test/tc_reader_char_encodings.rb +149 -169
  45. data/test/tc_record.rb +143 -148
  46. data/test/tc_subfield.rb +14 -13
  47. data/test/tc_unsafe_xml.rb +95 -0
  48. data/test/tc_writer.rb +101 -108
  49. data/test/tc_xml.rb +99 -87
  50. data/test/tc_xml_error_handling.rb +7 -8
  51. data/test/ts_marc.rb +8 -8
  52. metadata +94 -9
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 474c3ee37225584b3e5f189ff5f49507b82741da82aef0dd68a7e39180d25874
4
- data.tar.gz: f0d272c5171827dcfa327ae8d079ee0ada52ba8da5b981378f7c2352d16a7a0e
3
+ metadata.gz: 78a543b07dbaa8d6aeff40421038d6d24649bc1769db2f286017e9205428c086
4
+ data.tar.gz: ed2b006b3f4c32ec718ede1a5d56435311970144a8208821733c6089e82bfea7
5
5
  SHA512:
6
- metadata.gz: 04361e464361334b874b737e292e58acae879c3d086a68d5292abb5b8ccd9c28370173c3a60ca6d5700ef8dbd941b607a888ef9c7c700623d4440f577423217d
7
- data.tar.gz: 22162879382120991f8a76484c2de73ac5f2aa8daf163bb1de0977723102876e0c2f2a81b9e04d2f4c5beda189e4828edc4673c37bf8f02e3a4a1235a2abaf16
6
+ metadata.gz: 6a53a6fafee92dad72b644668a5b1adabcd0cdb1af6b2ed87d7f8c97d5e56415bcf1e990f113cb5f459807e792e2ac5846948e116534dbfdfd31061c1a44f905
7
+ data.tar.gz: 4b36af95c908ac282efe7f05f1205b66f8c6c6f62c9f750843012512360c5695af15c5bb23759ccad2703d6c0304540915c45ae3259068fe8a7cb1497f1237f6
@@ -0,0 +1,30 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: ''
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+ A clear and concise description of what the bug is.
12
+
13
+ **To Reproduce**
14
+ Steps to reproduce the behavior:
15
+ 1. Include, if possible, a sample file that exhibits the behavior
16
+ 2. Include minimal but relevant ruby-marc code that exhibits the behavior
17
+
18
+ **Expected behavior**
19
+ A clear and concise description of what you expected to happen.
20
+
21
+ **Program Output**
22
+ If applicable, add program output and/or backtraces
23
+
24
+ **Environment (please complete the following information):**
25
+ - ruby-marc version (from `MARC::VERSION`)
26
+ - ruby runtime and version (best: the output of `ruby -e 'puts RUBY_DESCRIPTION'`)
27
+ - operating system, if not included in output of `RUBY_DESCRIPTION`
28
+
29
+ **Additional context**
30
+ Add any other context about the problem here.
@@ -0,0 +1,24 @@
1
+ name: CI
2
+
3
+ on: [push, pull_request]
4
+
5
+ env:
6
+ # See https://github.com/jruby/jruby/issues/5509
7
+ JAVA_OPTS: "--add-opens java.xml/com.sun.org.apache.xerces.internal.impl=org.jruby.dist"
8
+
9
+ jobs:
10
+ tests:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ ruby: ['2.5', '2.6', '2.7', '3.0', jruby]
15
+ steps:
16
+ - uses: actions/checkout@v2
17
+ - name: Set up Ruby
18
+ uses: ruby/setup-ruby@v1
19
+ with:
20
+ ruby-version: ${{ matrix.ruby }}
21
+ - name: Install dependencies
22
+ run: bundle install --without documentation
23
+ - name: Run tests
24
+ run: bundle exec rake
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.standard.yml ADDED
@@ -0,0 +1 @@
1
+ ruby_version: 2.3
@@ -1,14 +1,92 @@
1
- v1.0.2 July 2017
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [1.2] - 2022-08-02
6
+
7
+ ### Added
8
+
9
+ * New XML writer `MARC::UnsafeXMLWriter` which is 15-20 times faster than the
10
+ default (rexml-based) writer. It mirrors code from the old
11
+ [`MARC::FastXMLWriter` gem](https://github.com/billdueber/marc-fastxmlwriter)
12
+ in a way that integrates better with the existing writer framework. It can
13
+ be used like any other writer,
14
+ e.g., `writer = MARC::UnsafeXMLWriter.new(filename)`. Note that while it
15
+ is "unsafe" in that it doesn't do checks for valid XML going out (its speed
16
+ comes from the fact that it's just concatenating strings together),
17
+ the `FastXMLWriter` gem has been used "in the wild" for years and doesn't
18
+ seem to cause anyone any problems.
19
+ * Added a new method, `MARC::Record.to_xml_string` which produces a
20
+ valid `<record>...</record>` XML snippet. It takes an optional keyword
21
+ argument to include namespace attributes on the
22
+ `<record>` tag, and another to use the new unsafe generator as
23
+ `record.to_xml_string(fast_but_unsafe: true)`.
24
+ * Added first-class support for `.jsonl` (aka "newline-delimited json")
25
+ files using the marc-in-json format via `MARC::JSONLReader` and
26
+ `MARC::JSONLWriter` which read and write marc-in-json. `ruby-marc` has
27
+ supported `#to_hash` and `#from_hash` to deal with this format at the
28
+ individual record level for a long time; this just provides the
29
+ reader/writer scaffolding.
30
+ * Also added `MARC::Record.to_json_string` to get a marc-in-json string
31
+ representation (parallel to the new `#to_xml_string`)
32
+ * New option to xml readers to ignore any namespaces
33
+ via `reader = MARC::XMLReader.new(filename, ignore_namespace: true)`. While
34
+ the REXML MARC-XML reader can't handle
35
+ (and thus has always ignored) XML namespaces, the Nokogiri-based version
36
+ will enforce namespaces if present. Useful only when you have
37
+ poorly-generated files where the XML namespace attributes are wonky.
38
+ * All writers will now self-close if used with a block (e.g.,
39
+ `MARC::Writer.new(filename) {|w| w.write(record)}`), parallel to the way
40
+ `File.open` works in regular ruby.
41
+ * XML writers will now take an optional keyword argument,
42
+ `include_namespace`, on both `#new` and `.encode`.
43
+
44
+ ### Changed
45
+ * Remove the `JREXML` parser, which apparently hasn't worked for years yet
46
+ also wasn't running in CI because the tests are running under bundler,
47
+ which didn't load `jrexml`. Set to emit a warning to use nokogiri
48
+ instead and fall back to REXML.
49
+ * 10-15% speed improvement when parsing MARC-XML with nokogiri (PR #97,
50
+ billdueber)
51
+ * Added deprecation warnings when using the `libxml`, `jstax`, or `jrexml`
52
+ xml parsers. When introduced, Nokogiri under JRuby was iffy. It's now
53
+ stable on both MRI and JRuby and faster than any of the other
54
+ included options and should be preferred. (PR #98, billdueber)
55
+ * MARC fields are now validated in their own post-creation stage (PR #66,
56
+ cbeer)
57
+ * Reduce the noise when running tests (billdueber)
58
+ * Reformatted this CHANGELOG.md file and added examples/structure to
59
+ README.md.
60
+
61
+
62
+ ### Fixed
63
+ * MARC-XML has requirements on the leader that are applied when writing out
64
+ MARC-XML by `MARC::XMLWriter.encode`. Previous versions would actually
65
+ mutate the record being written, resulting in a silent modification to
66
+ a record just because you were writing it out. Changed to use a duplicate
67
+ (PR #73, cbeer)
68
+ * Guard against multiple character calls when parsing XML (PR #74, cbeer)
69
+ * Minor Dublin Core code fixes (PRs #83 and #84, fjorba)
70
+ * `JRubyStaxReader` now supports Java 9+ / JRuby 9.3+ (PR #87, dmolesUC)
71
+
72
+ ## [1.1.1] - 2021-06-07
73
+
74
+ - Fix a regression when normalizing indicator values when serializing marcxml
75
+
76
+ ## [1.1.0] - 2021-06-01
77
+ - Add support for additional valid subfield codes in marcxml
78
+
79
+ ## [1.0.2] - 2017-08-01
2
80
  - Now (correctly) throw an error if datafield string is the empty string
3
81
  (thanks to @bibliotechy)
4
82
 
5
- v1.0.1 February 2016
83
+ ## [1.0.1] - 2016-02-29
6
84
  - Non-user-facing change in implementation of FieldMap strictly for performance
7
85
 
8
- v1.0.0 January 2015
86
+ ## [1.0.0] - 2015-01-28
9
87
  - Mostly changes that deal with encoding, plus the plunge to a 1.0 release
10
88
 
11
- v0.5.0 April 2012
89
+ ## [0.5.0] April 2012
12
90
  - Extensive rewrite of MARC::Reader (ISO 2709 binary reader) to provide a
13
91
  fairly complete and consistent handling of char encoding issues in ruby 1.9.
14
92
  - This code is well covered by automated tests, but ends up complex, there
@@ -21,75 +99,75 @@ v0.5.0 April 2012
21
99
  non-unicode encodings to UTF-8 for you. This version will not do
22
100
  so unless you ask it to with correct arguments.
23
101
 
24
- v0.4.4 Sat Mar 03 14:55:00 EDT 2012
102
+ ## [0.4.4] Sat Mar 03 14:55:00 EDT 2012
25
103
  - Fixed performance regression: strict reader will parse about 5x faster now
26
104
  - Updated CHANGES file for first time in a long time :-)
27
105
 
28
- v0.3.0 Wed Sep 23 21:51:00 EDT 2009
106
+ ## [0.3.0] Wed Sep 23 21:51:00 EDT 2009
29
107
  - Nokogiri and jrexml parser integration added as well as Ruby 1.9 support
30
108
 
31
- v0.2.2 Tue Dec 30 09:50:33 EST 2008
109
+ ## [0.2.2] Tue Dec 30 09:50:33 EST 2008
32
110
  - DataField tags that are all numeric are now padded with leading zeros
33
111
 
34
- v0.2.1 Mon Aug 18 14:14:16 EDT 2008
112
+ ## [0.2.1] Mon Aug 18 14:14:16 EDT 2008
35
113
  - can now process records that have fields tags that are non-numeric (thanks
36
114
  Ross Singer)
37
115
 
38
- v0.2.0 Wed Jun 11 12:42:20 EDT 2008
116
+ ## [0.2.0] Wed Jun 11 12:42:20 EDT 2008
39
117
  - added newline to output generated by REXML::Formatters::Default to make
40
118
  it a bit more friendly. REXML::Formatters::Pretty and Transitive just
41
119
  don't do what I want (whitespace in weird places).
42
120
 
43
- v0.1.9 Thu Jun 5 12:00:01 EDT 2008
121
+ ## [0.1.9] - Thu Jun 5 12:00:01 EDT 2008
44
122
  - small docfix change in XMLReader
45
123
  - use REXML::Formatters::Default instead of deprecated REXML::Element.write
46
124
 
47
- v0.1.8 Tue Nov 13 22:51:03 EST 2007
125
+ ## [0.1.8] - Tue Nov 13 22:51:03 EST 2007
48
126
  - added examples directory
49
127
  - fixed problem with leading whitespace and the leader in xml reader
50
128
  (thanks Morgan Cundiff)
51
129
 
52
- v0.1.7 Mon Nov 12 09:33:57 EST 2007
130
+ ## [0.1.7] - Mon Nov 12 09:33:57 EST 2007
53
131
  - updated Record.to_marc documentation to be a bit more precise
54
132
  - removed doc references to MARC::Field which is no longer around
55
133
  - changed from Artistic to MIT License
56
134
 
57
- v0.1.6 Fri May 4 12:37:33 EDT 2007
135
+ ## [0.1.6] - Fri May 4 12:37:33 EDT 2007
58
136
  - fixed bad record length test
59
137
  - removed MARC::XMLWriter convert_to_utf8 which wasn't really working and
60
138
  shouldn't be there if it isn't good
61
139
  - added unescaping of entities to MARC::XMLReader
62
140
 
63
- v0.1.5 Tue May 1 16:50:02 EDT 2007
141
+ ## [0.1.5] - Tue May 1 16:50:02 EDT 2007
64
142
  - docfix in MARC::DataField (thanks Jason Ronallo)
65
143
  - multiple docfixes (thanks Jonathan Rochkind)
66
144
 
67
- v0.1.4 Tue Jan 2 15:45:53 EST 2007
145
+ ## [0.1.4] - Tue Jan 2 15:45:53 EST 2007
68
146
  - fixed bug in MARC::XMLWriter that was outputting all control field tags as 00z
69
147
  (thanks Ross Singer)
70
148
  - added :include_namespace option to MARC::XMLWriter::encode to include the
71
149
  marcxml namespace, which allows MARC::Record::to_xml to emit the namespace
72
150
  for a single record.
73
151
 
74
- v0.1.3 Tue Jan 2 12:56:36 EST 2007
152
+ ## [0.1.3] - Tue Jan 2 12:56:36 EST 2007
75
153
  - added ability to map a MARC record to the Dublin Core fields. Calling
76
154
  to_dublin_core on a MARC::Record returns a hash that has Dublin Core fields
77
155
  as the hash keys.
78
156
 
79
- v0.1.2 Thu Dec 21 18:46:01 EST 2007
157
+ ## [0.1.2] - Thu Dec 21 18:46:01 EST 2007
80
158
  - fixed MARC::Record::to_xml so that it actually is tested and works (thanks
81
159
  Ross Singer)
82
160
 
83
- v0.1.1
161
+ ## [0.1.1] -
84
162
  - added ability to pass File like objects to the constructor for
85
163
  MARC::XMLReader like MARC::Reader (thanks Jake Glenn)
86
164
 
87
- v0.1.0 Wed Dec 6 15:40:40 EST 2006
165
+ ## [0.1.0] - Wed Dec 6 15:40:40 EST 2006
88
166
  - fixed pretty xml when stylesheet is used
89
167
  - added value() to MARC::DataField
90
168
  - added Rakefile for testing/building
91
169
 
92
- v0.0.9 Tue Mar 28 10:02:16 CST 2006
170
+ ## [0.0.9] - Tue Mar 28 10:02:16 CST 2006
93
171
  - changed XMLWriter.write to output pretty-printed XML
94
172
  - normalized Text in XML output
95
173
  - added XMLWriter checks and replacements for bad subfield codes and indicator
@@ -102,7 +180,7 @@ v0.0.9 Tue Mar 28 10:02:16 CST 2006
102
180
  test.
103
181
  - added :stylesheet argument to XMLWriter.new
104
182
 
105
- v0.0.8 Mon Jan 16 22:31:00 EST 2006
183
+ ## [0.0.8] - Mon Jan 16 22:31:00 EST 2006
106
184
  - removed control tests out of tc_field.rb into tc_control.rb
107
185
  - fixed some formatting
108
186
  - changed control/field to controlfield/datafield
@@ -114,7 +192,7 @@ v0.0.8 Mon Jan 16 22:31:00 EST 2006
114
192
  - fixed xmlreader strip_ns which was returning Nil when no namespace
115
193
  was found on an element (exposed by namespace changes).
116
194
 
117
- v0.0.7 Mon Jan 2 21:39:28 CST 2006
195
+ ## [0.0.7] - Mon Jan 2 21:39:28 CST 2006
118
196
  - MARC::XMLWriter added
119
197
  - removed encode/decode methods in MARC::MARC21 into MARC::Writer and
120
198
  MARC::Reader respectively. This required pushing MARC21 specific constants
@@ -125,26 +203,25 @@ v0.0.7 Mon Jan 2 21:39:28 CST 2006
125
203
  - added xml reading tests
126
204
  - fixed indentation to be two spaces
127
205
 
128
- v0.0.6 Tue Oct 18 09:33:12 CDT 2005
206
+ ## [0.0.6] - Tue Oct 18 09:33:12 CDT 2005
129
207
  - MARC::MARC21::decode throws an exception when a directory can't be found.
130
208
  Exception is caught and ignored in MARC::ForgivingReader
131
209
 
132
- v0.0.5 Tue Oct 18 01:50:40 CDT 2005
210
+ ## [0.0.5] - Tue Oct 18 01:50:40 CDT 2005
133
211
  - when unspecified field indicators are forced to blanks
134
212
  - checking for when a field appears to not have indicators and subfields in
135
213
  which case the field is skipped entirely
136
214
 
137
- v0.0.4 Tue Oct 18 00:39:50 CDT 2005
215
+ ## [0.0.4] - Tue Oct 18 00:39:50 CDT 2005
138
216
  - fixed off by one error when reading in leader, previous versions were
139
217
  reading an extra character
140
218
 
141
- v0.0.3 Mon Oct 17 22:51:23 CDT 2005
219
+ ## [0.0.3] - Mon Oct 17 22:51:23 CDT 2005
142
220
  - added ForgivingReader class and support for reading records without using
143
221
  possibly faulty offsets when the user needs them.
144
222
 
145
- v0.0.2 Mon Oct 17 17:42:57 CDT 2005
223
+ ## [0.0.2] - Mon Oct 17 17:42:57 CDT 2005
146
224
  - updated version string to see if it'll fix some gem oddness
147
225
 
148
- v0.0.1 Mon Oct 10 10:29:20 CDT 2005
226
+ ## [0.0.1] - Mon Oct 10 10:29:20 CDT 2005
149
227
  - initial release
150
-
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source "https://rubygems.org"
2
+
3
+ group :test do
4
+ if RUBY_VERSION != "1.8.7"
5
+ gem "nokogiri"
6
+ end
7
+ gem "rake"
8
+ gem "rdoc"
9
+ gem "xml-simple"
10
+ gem "test-unit"
11
+ gem "warning"
12
+ end
13
+
14
+ # Specify your gem's dependencies in marc.gemspec
15
+ gemspec
data/README.md CHANGED
@@ -1,64 +1,230 @@
1
1
  [![Gem Version](https://badge.fury.io/rb/marc.png)](http://badge.fury.io/rb/marc)
2
- [![Build Status](https://secure.travis-ci.org/ruby-marc/ruby-marc.png)](http://travis-ci.org/ruby-marc/ruby-marc)
2
+ ![Build Status](https://github.com/ruby-marc/ruby-marc/workflows/CI/badge.svg)
3
+ |
3
4
 
4
5
  marc is a ruby library for reading and writing MAchine Readable Cataloging
5
6
  (MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
6
7
 
7
- ## Usage
8
+ ## Usage
8
9
 
9
- require 'marc'
10
-
11
- # reading records from a batch file
12
- reader = MARC::Reader.new('marc.dat', :external_encoding => "MARC-8")
13
- for record in reader
14
- # print out field 245 subfield a
15
- puts record['245']['a']
16
- end
10
+ ### Basics
11
+
12
+ ```ruby
13
+
14
+ reader = MARC::Reader.new("myfile.mrc")
15
+ reader.each do |record|
16
+ first_245 = record["245"] #=> #<MARC::DataField...>
17
+ first_245.to_s #=> "245 04 $a The Texas ranger $h [sound recording] / $c Sung by Beale D. Taylor. "
18
+ first_245.value #=> "The Texas ranger[sound recording] /Sung by Beale D. Taylor."
19
+ first_245.codes #=> ["a", "h", "c"]
20
+ first_245["a"] #=> "The Texas ranger"
17
21
 
18
- # creating a record
19
- record = MARC::Record.new()
20
- record.append(MARC::DataField.new('100', '0', ' ', ['a', 'John Doe']))
22
+ # A record is an enumerable over its fields and thus can use things like
23
+ # #each, #select, #find, etc.
21
24
 
22
- # writing a record
23
- writer = MARC::Writer.new('marc.dat')
24
- writer.write(record)
25
- writer.close()
25
+ subject_fields = record.select{|f| f.tag =~ /\A6/}
26
26
 
27
- # writing a record as XML
28
- writer = MARC::XMLWriter.new('marc.xml')
29
- writer.write(record)
30
- writer.close()
31
-
32
- # encoding a record
33
- MARC::Writer.encode(record) # or record.to_marc
34
-
35
- MARC::Record provides `#to_hash` and `#from_hash` implementations that deal in ruby
36
- hash's that are compatible with the
37
- [marc-in-json](http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
38
- serialization format. You are responsible for serializing the hash to/from JSON yourself.
27
+ # Get author fields by supplying a list of tags
28
+ record.fields.each_by_tag(["100", "110", "111"]) do |field|
29
+ puts field.value
30
+ end
31
+ end
32
+ ```
39
33
 
40
- ## Installation
41
34
 
42
- gem install marc
35
+ ### Reading / Writing MARC21 binary data
43
36
 
44
- Or if you're using bundler, add to your Gemfile
37
+ ```ruby
38
+ require 'marc'
45
39
 
46
- gem 'marc'
47
-
48
- ## Character Encodings in 'binary' ISO-2709 MARC
40
+ # marc21 binary format uses MARC::Reader and MARC::Writer
41
+
42
+ reader = MARC::Reader.new('marc.dat')
43
+ reader.each do |record|
44
+ title = record["245"].value
45
+ puts title
46
+ end
47
+ ```
48
+
49
+ If you know you have another encoding, you can specify it
50
+
51
+ ```ruby
52
+ reader = MARC::Reader.new("marc.dat", external_encoding: "MARC-8")
53
+ ```
54
+
55
+ While generally used with files, you can also give a reader an IO object
56
+ (usually an already-opened file or a StringIO object)
57
+
58
+ ```ruby
59
+ marc_data = File.open("marc.dat")
60
+ reader = MARC::Reader.new(marc_data)
61
+ ```
62
+
63
+ Similarly, you can write to either a file or an IO-like object
64
+
65
+ ```ruby
66
+ writer = MARC::Writer.new("myfile.dat")
67
+ # writer = MARC::Writer.new(Zlib::GzipWriter.open("myfile.dat.gz"))
68
+
69
+ myrecords.each do |rec|
70
+ writer.write(rec)
71
+ end
72
+ writer.close
73
+ ```
74
+
75
+ ### Reading/Writing marc-in-json
76
+
77
+ [marc-in-json](https://rossfsinger.com/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
78
+ is a simple hash-based serialization format for MARC, often used with the
79
+ [jsonl](https://jsonlines.org/) (aka jsonlines or newline-delimited-json)
80
+ file format which puts one json structure on each line.
81
+
82
+ ```ruby
83
+
84
+ reader = MARC::JSONLReader.new("myfile.jsonl")
85
+ writer = MARC::JSONLWriter.new("my_other_file.jsonl")
86
+ reader.each do |record|
87
+ writer.write(record)
88
+ end
89
+ writer.close
90
+
91
+ ```
92
+
93
+ ### Reading/Writing MARC-XML
94
+
95
+ MARC-XML is an XML-based serialization format for MARC records. It is,
96
+ generally speaking, a lot slower than using MARC21 or marc-in-json.
97
+
98
+ There are two XML parsers supported going forwards within the ruby-marc code
99
+ base: REXML (the first, and for a long time only, ruby XML parser based on
100
+ regular expressions) and Nokogiri. Both are compatible with both MRI ("normal") ruby and JRuby.
101
+
102
+ The Nokogiri parser is about 6x faster than using REXML. See performance
103
+ numbers, below.
104
+
105
+ At one time, it was difficult to install Nokogiri under MRI and impossible
106
+ under JRuby. Because of this historical blip, nokogiri is _not_
107
+ automatically included when doing `require "marc"` in your code. If you want
108
+ to use the Nokogiri-based parser, you must include it explicitly.
109
+
110
+ ```ruby
111
+ require "nokogiri"
112
+ require "marc"
113
+
114
+ reader = MARC::XMLReader.new("myfile.xml", parser: "nokogiri")
115
+ ```
116
+
117
+ The `parser` argument works as follows:
118
+
119
+ * if not included, REXML is used
120
+ * if "rexml" or "nokogiri", the appropriate parser will be used
121
+ * if "magic", the Nokogiri parser will be used if Nokogiri has been loaded;
122
+ otherwise it will fall back to using REXML.
123
+
124
+ ```ruby
125
+ # Use the best available
126
+ reader = MARC::XMLReader.new("my_file.xml", parser: "magic")
127
+ ```
128
+
129
+ ### "Self-closing" writers
49
130
 
50
- The Marc binary (ISO 2709) Reader (MARC::Reader) has some features for helping you deal with character encodings in ruby 1.9. It is always recommended to supply an explicit :external_encoding option to MARC::Reader; either any valid ruby encoding, _or_ the string "MARC-8". MARC-8 input will by default be transcoded to a UTF-8 internal representation.
131
+ Much like one can [open a file and have it automatically close at the end
132
+ of a block](https://ruby-doc.org/core-2.5.0/File.html#method-c-open) in
133
+ standard ruby, the various writers will do the same.
51
134
 
52
- MARC::Reader does _not_ currently have any facilities for guessing encoding from MARC21 leader byte 9, that is
53
- ignored.
135
+ ```ruby
54
136
 
55
- Consult the MARC::Reader class docs for a more complete discussion and range of options.
137
+ # separate writer and #close
138
+ reader = MARC::Reader.new("my_marc.mrc")
139
+ writer = MARC::UnsafeXMLWriter.new("my_marc.xml")
140
+ reader.each do |record|
141
+ writer.write(record)
142
+ end
143
+ writer.close
56
144
 
57
- The MARC binary Writer (MARC::Writer) does not have any encoding-related features -- it's up to you the developer to make sure you create MARC::Records with consistent and expected char encodings, although MARC::Writer will write out a legal ISO 2709 either way, it just might have corrupted encodings.
145
+ # "self-closing" equivalent
146
+ reader = MARC::Reader.new("my_marc.mrc")
147
+ MARC::UnsafeXMLWriter.new("my_marc.xml") do |w|
148
+ reader.each do |record|
149
+ w.write(record)
150
+ end
151
+ end
152
+ # no need to close the writer here
153
+ ```
154
+
155
+ ### Serializing a single record
156
+
157
+ The `MARC::Record` class has utility functions to serialize to the various
158
+ formats. These are generally thin wrappers around the `encode` class
159
+ methods (e.g., `MARC::Writer.encode`, `MARC::XMLWriter.encode`, etc.)
160
+
161
+ * `record.to_marc` will produce a marc21 binary string
162
+ * `record.to_json_string` returns a string containing the JSON document
163
+ for the marc-in-json serialization
164
+ * This just json-ifies `record.to_hash`, which returns a hash compatible
165
+ with the marc-in-json format.
166
+ * `record.to_xml_string` returns the actual XML string, with the following
167
+ options:
168
+ * `include_namespace: true` (default: `true`) will include the MARC namespace
169
+ attributes
170
+ * `fast_but_unsafe: true` (default: `false`) will use the much faster
171
+ `MARC::UnsafeXMLWriter` code, which produces the XML by string
172
+ concatenation. See that class for more information, but in general, if
173
+ your MARC isn't wildly invalid, it works fine and is roughly 15x faster.
174
+ The default (REXML) simply does `record.to_xml.to_s`
175
+
176
+ Note that `record.to_xml`, for historical reasons, returns an REXML document of
177
+ the XML serialization and _not_ an XML string as one might expect.
178
+
179
+
180
+ ## Benchmarking reading MARC in various formats
181
+
182
+ A simple benchmark run on a single thread on a 2017-era x64 Macintosh
183
+ gives the numbers below.
184
+
185
+ ```
186
+ With mri 3.1.0 and jruby 9.3.6.0
187
+
188
+ Format Implementation Ruby r/sec x Slower compared to fastest
189
+ ===================================================================
190
+ jsonl stdlib JSON mri 6512 1.0
191
+ jsonl O j mri 6199 1.0
192
+ marc21 MARC::Reader mri 2889 2.3
193
+ marc-xml Nokogiri mri 1451 4.6
194
+ marc-xml REXML mri 239 28.0
195
+
196
+ marc21 MARC::Reader jruby 5455 1.2
197
+ jsonl stdlib JSON jruby 5437 1.2
198
+ marc-xml Nokogiri jruby 1631 4.1
199
+ marc-xml REXML jruby 253 26.5
200
+
201
+ ```
202
+
203
+ Note especially that if you're using MARC-XML, Nokogiri will read in
204
+ records 4-5 times faster.
205
+
206
+ ## Character Encoding issues
207
+
208
+ The Marc binary (ISO 2709) Reader (MARC::Reader) has some features for helping
209
+ you deal with character encodings in ruby 1.9. It is always recommended to
210
+ supply an explicit :external_encoding option to MARC::Reader; either any valid
211
+ ruby encoding, _or_ the string "MARC-8".
212
+ MARC-8 input will by default be transcoded to a UTF-8 internal representation.
213
+
214
+ MARC::Reader does _not_ currently have any facilities for guessing encoding
215
+ from MARC21 leader byte 9, that is ignored.
216
+
217
+ Consult the MARC::Reader class docs for a more complete discussion and range
218
+ of options.
219
+
220
+ The MARC binary Writer (MARC::Writer) does not have any encoding-related
221
+ features -- it's up to you the developer to make sure you create MARC::Records
222
+ with consistent and expected char encodings, although MARC::Writer will write
223
+ out a legal ISO 2709 either way, it just might have corrupted encodings.
58
224
 
59
225
  When parsing MARCXML _with Nokogiri as your XML parser implementation_ up to
60
- and including version `1.0.2` of this gem, if the XML was badly formed, parsing
61
- would stop and no error would be reported to your code.
226
+ and including version `1.0.2` of this gem, if the XML was badly formed,
227
+ parsing would stop and no error would be reported to your code.
62
228
 
63
229
  If you are using a version > `1.0.2` of `ruby-marc` with MRI + Nokogiri, XML
64
230
  syntax errors will be thrown (and you may need to adjust your code to account
@@ -67,17 +233,44 @@ using Nokogiri as an XML parser with JRuby as your ruby implementation, XML
67
233
  syntax errors will still be ignored unless you have Nokogiri version `1.10.2`
68
234
  or later.
69
235
 
70
- ## Miscellany
236
+ ## JRubySTAXReader caveats
237
+
238
+ NOTE: The JRubySTAXReader is deprecated. Nokogiri should be used instead.
239
+
240
+ - Under Java 9+, MARC::JRubySTAXReader requires adding the following
241
+ to `JAVA_OPTS`
242
+ in order to work
243
+ around [Java module system](https://openjdk.java.net/jeps/261)
244
+ restrictions:
245
+
246
+ ```sh
247
+ --add-opens java.xml/com.sun.org.apache.xerces.internal.impl=org.jruby.dist
248
+ ```
249
+
250
+ - MARC::JRubySTAXReader is deprecated and will be removed in a future version
251
+ of
252
+ `ruby-marc`. Please use MARC::JREXMLReader or MARC::NokogiriReader instead.
253
+
254
+ ## Miscellany
71
255
 
72
256
  Source code at: https://github.com/ruby-marc/ruby-marc/
73
257
 
74
258
  Find generated API docs at: http://rubydoc.info/gems/marc/frames
75
259
 
76
- Run automated tests in source with `rake test`.
260
+ Run automated tests in source with `rake test`.
261
+
262
+ Developers, release new version of gem to rubygems with `rake release`
263
+ (bundler-supplied task). Note that one nice thing this will do is
264
+ automatically tag the version in git, very important for later figuring out
265
+ what's going on.
77
266
 
78
- Developers, release new version of gem to rubygems with `rake release`
79
- (bundler-supplied task). Note that one nice thing this will do is automatically
80
- tag the version in git, very important for later figuring out what's going on.
267
+ ## Installation
268
+
269
+ gem install marc
270
+
271
+ Or if you're using bundler, add to your Gemfile
272
+
273
+ gem 'marc'
81
274
 
82
275
  ## Authors
83
276