marc 1.0.4 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +30 -0
- data/.github/workflows/ruby.yml +24 -0
- data/.gitignore +17 -0
- data/.standard.yml +1 -0
- data/{Changes → CHANGELOG.md} +106 -29
- data/Gemfile +15 -0
- data/README.md +240 -47
- data/Rakefile +14 -14
- data/bin/marc +14 -0
- data/bin/marc2xml +17 -0
- data/examples/xml2marc.rb +10 -0
- data/lib/marc/constants.rb +3 -3
- data/lib/marc/controlfield.rb +35 -23
- data/lib/marc/datafield.rb +70 -63
- data/lib/marc/dublincore.rb +59 -41
- data/lib/marc/exception.rb +9 -1
- data/lib/marc/jsonl_reader.rb +33 -0
- data/lib/marc/jsonl_writer.rb +44 -0
- data/lib/marc/marc8/map_to_unicode.rb +16417 -16420
- data/lib/marc/marc8/to_unicode.rb +80 -86
- data/lib/marc/reader.rb +119 -121
- data/lib/marc/record.rb +72 -62
- data/lib/marc/subfield.rb +12 -10
- data/lib/marc/unsafe_xmlwriter.rb +93 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/writer.rb +27 -30
- data/lib/marc/xml_parsers.rb +222 -197
- data/lib/marc/xmlreader.rb +131 -114
- data/lib/marc/xmlwriter.rb +93 -81
- data/lib/marc.rb +20 -18
- data/marc.gemspec +23 -0
- data/test/marc8/tc_marc8_mapping.rb +3 -3
- data/test/marc8/tc_to_unicode.rb +28 -32
- data/test/messed_up_leader.xml +9 -0
- data/test/tc_controlfield.rb +37 -34
- data/test/tc_datafield.rb +65 -60
- data/test/tc_dublincore.rb +9 -11
- data/test/tc_hash.rb +10 -13
- data/test/tc_jsonl.rb +19 -0
- data/test/tc_marchash.rb +17 -21
- data/test/tc_parsers.rb +108 -144
- data/test/tc_reader.rb +35 -36
- data/test/tc_reader_char_encodings.rb +149 -169
- data/test/tc_record.rb +143 -148
- data/test/tc_subfield.rb +14 -13
- data/test/tc_unsafe_xml.rb +95 -0
- data/test/tc_writer.rb +101 -108
- data/test/tc_xml.rb +99 -87
- data/test/tc_xml_error_handling.rb +7 -8
- data/test/ts_marc.rb +8 -8
- metadata +94 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78a543b07dbaa8d6aeff40421038d6d24649bc1769db2f286017e9205428c086
|
4
|
+
data.tar.gz: ed2b006b3f4c32ec718ede1a5d56435311970144a8208821733c6089e82bfea7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6a53a6fafee92dad72b644668a5b1adabcd0cdb1af6b2ed87d7f8c97d5e56415bcf1e990f113cb5f459807e792e2ac5846948e116534dbfdfd31061c1a44f905
|
7
|
+
data.tar.gz: 4b36af95c908ac282efe7f05f1205b66f8c6c6f62c9f750843012512360c5695af15c5bb23759ccad2703d6c0304540915c45ae3259068fe8a7cb1497f1237f6
|
@@ -0,0 +1,30 @@
|
|
1
|
+
---
|
2
|
+
name: Bug report
|
3
|
+
about: Create a report to help us improve
|
4
|
+
title: ''
|
5
|
+
labels: ''
|
6
|
+
assignees: ''
|
7
|
+
|
8
|
+
---
|
9
|
+
|
10
|
+
**Describe the bug**
|
11
|
+
A clear and concise description of what the bug is.
|
12
|
+
|
13
|
+
**To Reproduce**
|
14
|
+
Steps to reproduce the behavior:
|
15
|
+
1. Include, if possible, a sample file that exhibits the behavior
|
16
|
+
2. Include minimal but relevant ruby-marc code that exhibits the behavior
|
17
|
+
|
18
|
+
**Expected behavior**
|
19
|
+
A clear and concise description of what you expected to happen.
|
20
|
+
|
21
|
+
**Program Output**
|
22
|
+
If applicable, add program output and/or backtraces
|
23
|
+
|
24
|
+
**Environment (please complete the following information):**
|
25
|
+
- ruby-marc version (from `MARC::VERSION`)
|
26
|
+
- ruby runtime and version (best: the output of `ruby -e 'puts RUBY_DESCRIPTION'`)
|
27
|
+
- operating system, if not included in output of `RUBY_DESCRIPTION`
|
28
|
+
|
29
|
+
**Additional context**
|
30
|
+
Add any other context about the problem here.
|
@@ -0,0 +1,24 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
env:
|
6
|
+
# See https://github.com/jruby/jruby/issues/5509
|
7
|
+
JAVA_OPTS: "--add-opens java.xml/com.sun.org.apache.xerces.internal.impl=org.jruby.dist"
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
tests:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
strategy:
|
13
|
+
matrix:
|
14
|
+
ruby: [2.5, 2.6, 2.7, 3.0, jruby]
|
15
|
+
steps:
|
16
|
+
- uses: actions/checkout@v2
|
17
|
+
- name: Set up Ruby
|
18
|
+
uses: ruby/setup-ruby@v1
|
19
|
+
with:
|
20
|
+
ruby-version: ${{ matrix.ruby }}
|
21
|
+
- name: Install dependencies
|
22
|
+
run: bundle install --without documentation
|
23
|
+
- name: Run tests
|
24
|
+
run: bundle exec rake
|
data/.gitignore
ADDED
data/.standard.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
ruby_version: 2.3
|
data/{Changes → CHANGELOG.md}
RENAMED
@@ -1,14 +1,92 @@
|
|
1
|
-
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
## [1.2] - 2022-08-02
|
6
|
+
|
7
|
+
### Added
|
8
|
+
|
9
|
+
* New XML writer `MARC::UnsafeXMLWriter` which is 15-20 times faster than the
|
10
|
+
default (rexml-based) writer. It mirrors code from the old
|
11
|
+
[`MARC::FastXMLWriter` gem](https://github.com/billdueber/marc-fastxmlwriter)
|
12
|
+
in a way that integrates better with the existing writer framework. It can
|
13
|
+
be used like any other writer,
|
14
|
+
e.g., `writer = MARC::UnsafeXMLWriter.new(filename)`. Note that while it
|
15
|
+
is "unsafe" in that it doesn't do checks for valid XML going out (its speed
|
16
|
+
comes from the fact that it's just concatenating strings together),
|
17
|
+
the `FastXMLWriter` gem has been used "in the wild" for years and doesn't
|
18
|
+
seem to cause anyone any problems.
|
19
|
+
* Added a new method, `MARC::Record.to_xml_string` which produces a
|
20
|
+
valid `<record>...</record>` XML snippet. It takes an optional keyword
|
21
|
+
argument to include namespace attributes on the
|
22
|
+
`<record>` tag, and another to use the new unsafe generator as
|
23
|
+
`record.to_xml_string(fast_but_unsafe: true)`.
|
24
|
+
* Added first-class support for `.jsonl` (aka "newline-delimited json")
|
25
|
+
files using the marc-in-json format via `MARC::JSONLReader` and
|
26
|
+
`MARC::JSONLWriter` which read and write marc-in-json. `ruby-marc` has
|
27
|
+
supported `#to_hash` and `#from_hash` to deal with this format at the
|
28
|
+
individual record level for a long time; this just provides the
|
29
|
+
reader/writer scaffolding.
|
30
|
+
* Also added `MARC::Record.to_json_string` to get a marc-in-json string
|
31
|
+
representation (parallel to the new `#to_xml_string`)
|
32
|
+
* New option to xml readers to ignore any namespaces
|
33
|
+
via `reader = MARC::XMLReader.new(filename, ignore_namespace: true)`. While
|
34
|
+
the REXML MARC-XML reader can't handle
|
35
|
+
(and thus has always ignored) XML namespaces, the Nokogiri-based version
|
36
|
+
will enforce namespaces if present. Useful only when you have
|
37
|
+
poorly-generated files where the XML namespace attributes are wonky.
|
38
|
+
* All writers will now self-close if used with a block (e.g.,
|
39
|
+
`MARC::Writer.new(filename) {|w| w.write(record)}`), parallel to the way
|
40
|
+
`File.open` works in regular ruby.
|
41
|
+
* XML writers will now take an optional keyword argument,
|
42
|
+
`include_namespace`, on both `#new` and `.encode`.
|
43
|
+
|
44
|
+
### Changed
|
45
|
+
* Remove the `JREXML` parser, which apparently hasn't worked for years yet
|
46
|
+
also wasn't running in CI because the tests are running under bundler,
|
47
|
+
which didn't load `jrexml`. Set to emit a warning to use nokogiri
|
48
|
+
instead and fall back to REXML.
|
49
|
+
* 10-15% speed improvement when parsing MARC-XML with nokogiri (PR #97,
|
50
|
+
billdueber)
|
51
|
+
* Added deprecation warnings when using the `libxml`, `jstax`, or `jrexml`
|
52
|
+
xml parsers. When introduced, Nokogiri under JRuby was iffy. It's now
|
53
|
+
stable on both MRI and JRuby and faster than any of the other
|
54
|
+
included options and should be preferred. (PR #98, billdueber)
|
55
|
+
* MARC fields are now validated in their own post-creation stage (PR #66,
|
56
|
+
cbeer)
|
57
|
+
* Reduce the noise when running tests (billdueber)
|
58
|
+
* Reformatted this CHANGELOG.md file and added examples/structure to
|
59
|
+
README.md.
|
60
|
+
|
61
|
+
|
62
|
+
### Fixed
|
63
|
+
* MARC-XML has requirements on the leader that are applied when writing out
|
64
|
+
MARC-XML by `MARC::XMLWriter.encode`. Previous versions would actually
|
65
|
+
mutate the record being written, resulting in a silent modification to
|
66
|
+
a record just because you were writing it out. Changed to use a duplicate
|
67
|
+
(PR #73, cbeer)
|
68
|
+
* Guard against multiple character calls when parsing XML (PR #74, cbeer)
|
69
|
+
* Minor Dublin Core code fixes (PRs #83 and #84, fjorba)
|
70
|
+
* `JRubyStaxReader` now supports Java 9+ / JRuby 9.3+ (PR #87, dmolesUC)
|
71
|
+
|
72
|
+
## [1.1.1] - 2021-06-07
|
73
|
+
|
74
|
+
- Fix a regression when normalizing indicator values when serializing marcxml
|
75
|
+
|
76
|
+
## [1.1.0] - 2021-06-01
|
77
|
+
- Add support for additional valid subfield codes in marcxml
|
78
|
+
|
79
|
+
## [1.0.2] - 2017-08-01
|
2
80
|
- Now (correctly) throw an error if datafield string is the empty string
|
3
81
|
(thanks to @bibliotechy)
|
4
82
|
|
5
|
-
|
83
|
+
## [1.0.1] - 2016-02-29
|
6
84
|
- Non-user-facing change in implementation of FieldMap strictly for performance
|
7
85
|
|
8
|
-
|
86
|
+
## [1.0.0] - 2015-01-28
|
9
87
|
- Mostly changes that deal with encoding, plus the plunge to a 1.0 release
|
10
88
|
|
11
|
-
|
89
|
+
## [0.5.0] April 2012
|
12
90
|
- Extensive rewrite of MARC::Reader (ISO 2709 binary reader) to provide a
|
13
91
|
fairly complete and consistent handling of char encoding issues in ruby 1.9.
|
14
92
|
- This code is well covered by automated tests, but ends up complex, there
|
@@ -21,75 +99,75 @@ v0.5.0 April 2012
|
|
21
99
|
non-unicode encodings to UTF-8 for you. This version will not do
|
22
100
|
so unless you ask it to with correct arguments.
|
23
101
|
|
24
|
-
|
102
|
+
## [0.4.4] Sat Mar 03 14:55:00 EDT 2012
|
25
103
|
- Fixed performance regression: strict reader will parse about 5x faster now
|
26
104
|
- Updated CHANGES file for first time in a long time :-)
|
27
105
|
|
28
|
-
|
106
|
+
## [0.3.0] Wed Sep 23 21:51:00 EDT 2009
|
29
107
|
- Nokogiri and jrexml parser integration added as well as Ruby 1.9 support
|
30
108
|
|
31
|
-
|
109
|
+
## [0.2.2] Tue Dec 30 09:50:33 EST 2008
|
32
110
|
- DataField tags that are all numeric are now padded with leading zeros
|
33
111
|
|
34
|
-
|
112
|
+
## [0.2.1] Mon Aug 18 14:14:16 EDT 2008
|
35
113
|
- can now process records that have fields tags that are non-numeric (thanks
|
36
114
|
Ross Singer)
|
37
115
|
|
38
|
-
|
116
|
+
## [0.2.0] Wed Jun 11 12:42:20 EDT 2008
|
39
117
|
- added newline to output generated by REXML::Formatters::Default to make
|
40
118
|
it a bit more friendly. REXML::Formatters::Pretty and Transitive just
|
41
119
|
don't do what I want (whitespace in weird places).
|
42
120
|
|
43
|
-
|
121
|
+
## [0.1.9] - Thu Jun 5 12:00:01 EDT 2008
|
44
122
|
- small docfix change in XMLReader
|
45
123
|
- use REXML::Formatters::Default instead of deprecated REXML::Element.write
|
46
124
|
|
47
|
-
|
125
|
+
## [0.1.8] - Tue Nov 13 22:51:03 EST 2007
|
48
126
|
- added examples directory
|
49
127
|
- fixed problem with leading whitespace and the leader in xml reader
|
50
128
|
(thanks Morgan Cundiff)
|
51
129
|
|
52
|
-
|
130
|
+
## [0.1.7] - Mon Nov 12 09:33:57 EST 2007
|
53
131
|
- updated Record.to_marc documentation to be a bit more precise
|
54
132
|
- removed doc references to MARC::Field which is no longer around
|
55
133
|
- changed from Artistic to MIT License
|
56
134
|
|
57
|
-
|
135
|
+
## [0.1.6] - Fri May 4 12:37:33 EDT 2007
|
58
136
|
- fixed bad record length test
|
59
137
|
- removed MARC::XMLWriter convert_to_utf8 which wasn't really working and
|
60
138
|
shouldn't be there if it isn't good
|
61
139
|
- added unescaping of entities to MARC::XMLReader
|
62
140
|
|
63
|
-
|
141
|
+
## [0.1.5] - Tue May 1 16:50:02 EDT 2007
|
64
142
|
- docfix in MARC::DataField (thanks Jason Ronallo)
|
65
143
|
- multiple docfixes (thanks Jonathan Rochkind)
|
66
144
|
|
67
|
-
|
145
|
+
## [0.1.4] - Tue Jan 2 15:45:53 EST 2007
|
68
146
|
- fixed bug in MARC::XMLWriter that was outputting all control field tags as 00z
|
69
147
|
(thanks Ross Singer)
|
70
148
|
- added :include_namespace option to MARC::XMLWriter::encode to include the
|
71
149
|
marcxml namespace, which allows MARC::Record::to_xml to emit the namespace
|
72
150
|
for a single record.
|
73
151
|
|
74
|
-
|
152
|
+
## [0.1.3] - Tue Jan 2 12:56:36 EST 2007
|
75
153
|
- added ability to map a MARC record to the Dublin Core fields. Calling
|
76
154
|
to_dublin_core on a MARC::Record returns a hash that has Dublin Core fields
|
77
155
|
as the hash keys.
|
78
156
|
|
79
|
-
|
157
|
+
## [0.1.2] - Thu Dec 21 18:46:01 EST 2007
|
80
158
|
- fixed MARC::Record::to_xml so that it actually is tested and works (thanks
|
81
159
|
Ross Singer)
|
82
160
|
|
83
|
-
|
161
|
+
## [0.1.1] -
|
84
162
|
- added ability to pass File like objects to the constructor for
|
85
163
|
MARC::XMLReader like MARC::Reader (thanks Jake Glenn)
|
86
164
|
|
87
|
-
|
165
|
+
## [0.1.0] - Wed Dec 6 15:40:40 EST 2006
|
88
166
|
- fixed pretty xml when stylesheet is used
|
89
167
|
- added value() to MARC::DataField
|
90
168
|
- added Rakefile for testing/building
|
91
169
|
|
92
|
-
|
170
|
+
## [0.0.9] - Tue Mar 28 10:02:16 CST 2006
|
93
171
|
- changed XMLWriter.write to output pretty-printed XML
|
94
172
|
- normalized Text in XML output
|
95
173
|
- added XMLWriter checks and replacements for bad subfield codes and indicator
|
@@ -102,7 +180,7 @@ v0.0.9 Tue Mar 28 10:02:16 CST 2006
|
|
102
180
|
test.
|
103
181
|
- added :stylesheet argument to XMLWriter.new
|
104
182
|
|
105
|
-
|
183
|
+
## [0.0.8] - Mon Jan 16 22:31:00 EST 2006
|
106
184
|
- removed control tests out of tc_field.rb into tc_control.rb
|
107
185
|
- fixed some formatting
|
108
186
|
- changed control/field to controlfield/datafield
|
@@ -114,7 +192,7 @@ v0.0.8 Mon Jan 16 22:31:00 EST 2006
|
|
114
192
|
- fixed xmlreader strip_ns which was returning Nil when no namespace
|
115
193
|
was found on an element (exposed by namespace changes).
|
116
194
|
|
117
|
-
|
195
|
+
## [0.0.7] - Mon Jan 2 21:39:28 CST 2006
|
118
196
|
- MARC::XMLWriter added
|
119
197
|
- removed encode/decode methods in MARC::MARC21 into MARC::Writer and
|
120
198
|
MARC::Reader respectively. This required pushing MARC21 specific constants
|
@@ -125,26 +203,25 @@ v0.0.7 Mon Jan 2 21:39:28 CST 2006
|
|
125
203
|
- added xml reading tests
|
126
204
|
- fixed indentation to be two spaces
|
127
205
|
|
128
|
-
|
206
|
+
## [0.0.6] - Tue Oct 18 09:33:12 CDT 2005
|
129
207
|
- MARC::MARC21::decode throws an exception when a directory can't be found.
|
130
208
|
Exception is caught and ignored in MARC::ForgivingReader
|
131
209
|
|
132
|
-
|
210
|
+
## [0.0.5] - Tue Oct 18 01:50:40 CDT 2005
|
133
211
|
- when unspecified field indicators are forced to blanks
|
134
212
|
- checking for when a field appears to not have indicators and subfields in
|
135
213
|
which case the field is skipped entirely
|
136
214
|
|
137
|
-
|
215
|
+
## [0.0.4] - Tue Oct 18 00:39:50 CDT 2005
|
138
216
|
- fixed off by one error when reading in leader, previous versions were
|
139
217
|
reading an extra character
|
140
218
|
|
141
|
-
|
219
|
+
## [0.0.3] - Mon Oct 17 22:51:23 CDT 2005
|
142
220
|
- added ForgivingReader class and support for reading records without using
|
143
221
|
possibly faulty offsets when the user needs them.
|
144
222
|
|
145
|
-
|
223
|
+
## [0.0.2] - Mon Oct 17 17:42:57 CDT 2005
|
146
224
|
- updated version string to see if it'll fix some gem oddness
|
147
225
|
|
148
|
-
|
226
|
+
## [0.0.1] - Mon Oct 10 10:29:20 CDT 2005
|
149
227
|
- initial release
|
150
|
-
|
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "https://rubygems.org"
|
2
|
+
|
3
|
+
group :test do
|
4
|
+
if RUBY_VERSION != "1.8.7"
|
5
|
+
gem "nokogiri"
|
6
|
+
end
|
7
|
+
gem "rake"
|
8
|
+
gem "rdoc"
|
9
|
+
gem "xml-simple"
|
10
|
+
gem "test-unit"
|
11
|
+
gem "warning"
|
12
|
+
end
|
13
|
+
|
14
|
+
# Specify your gem's dependencies in marc.gemspec
|
15
|
+
gemspec
|
data/README.md
CHANGED
@@ -1,64 +1,230 @@
|
|
1
1
|
[](http://badge.fury.io/rb/marc)
|
2
|
-
|
2
|
+

|
3
|
+
|
|
3
4
|
|
4
5
|
marc is a ruby library for reading and writing MAchine Readable Cataloging
|
5
6
|
(MARC). More information about MARC can be found at <http://www.loc.gov/marc>.
|
6
7
|
|
7
|
-
## Usage
|
8
|
+
## Usage
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
10
|
+
### Basics
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
|
14
|
+
reader = MARC::Reader.new("myfile.mrc")
|
15
|
+
reader.each do |record|
|
16
|
+
first_245 = record["245"] #=> #<MARC::DataField...>
|
17
|
+
first_245.to_s #=> "245 04 $a The Texas ranger $h [sound recording] / $c Sung by Beale D. Taylor. "
|
18
|
+
first_245.value #=> "The Texas ranger[sound recording] /Sung by Beale D. Taylor."
|
19
|
+
first_245.codes #=> ["a", "h", "c"]
|
20
|
+
first_245["a"] #=> "The Texas ranger"
|
17
21
|
|
18
|
-
|
19
|
-
|
20
|
-
record.append(MARC::DataField.new('100', '0', ' ', ['a', 'John Doe']))
|
22
|
+
# A record is an enumerable over its fields and thus can use things like
|
23
|
+
# #each, #select, #find, etc.
|
21
24
|
|
22
|
-
|
23
|
-
writer = MARC::Writer.new('marc.dat')
|
24
|
-
writer.write(record)
|
25
|
-
writer.close()
|
25
|
+
subject_fields = record.select{|f| f.tag =~ /\A6/}
|
26
26
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
MARC::Writer.encode(record) # or record.to_marc
|
34
|
-
|
35
|
-
MARC::Record provides `#to_hash` and `#from_hash` implementations that deal in ruby
|
36
|
-
hash's that are compatible with the
|
37
|
-
[marc-in-json](http://dilettantes.code4lib.org/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
|
38
|
-
serialization format. You are responsible for serializing the hash to/from JSON yourself.
|
27
|
+
# Get author fields by supplying a list of tags
|
28
|
+
record.fields.each_by_tag(["100", "110", "111"]) do |field|
|
29
|
+
puts field.value
|
30
|
+
end
|
31
|
+
end
|
32
|
+
```
|
39
33
|
|
40
|
-
## Installation
|
41
34
|
|
42
|
-
|
35
|
+
### Reading / Writing MARC21 binary data
|
43
36
|
|
44
|
-
|
37
|
+
```ruby
|
38
|
+
require 'marc'
|
45
39
|
|
46
|
-
|
47
|
-
|
48
|
-
|
40
|
+
# marc21 binary format uses MARC::Reader and MARC::Writer
|
41
|
+
|
42
|
+
reader = MARC::Reader.new('marc.dat')
|
43
|
+
reader.each do |record|
|
44
|
+
title = record["245"].value
|
45
|
+
puts title
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
If you know you have another encoding, you can specify it
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
reader = MARC::Reader.new("marc.dat", external_encoding: "MARC-8")
|
53
|
+
```
|
54
|
+
|
55
|
+
While generally used with files, you can also give a reader an IO object
|
56
|
+
(usually an already-opened file or a StringIO object)
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
marc_data = File.open("marc.dat")
|
60
|
+
reader = MARC::Reader.new(marc_data)
|
61
|
+
```
|
62
|
+
|
63
|
+
Similarly, you can write to either a file or an IO-like object
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
writer = MARC::Writer.new("myfile.dat")
|
67
|
+
# writer = MARC::Writer.new(Zlib::GzipWriter.open("myfile.dat.gz"))
|
68
|
+
|
69
|
+
myrecords.each do |rec|
|
70
|
+
writer.write(rec)
|
71
|
+
end
|
72
|
+
writer.close
|
73
|
+
```
|
74
|
+
|
75
|
+
### Reading/Writing marc-in-json
|
76
|
+
|
77
|
+
[marc-in-json](https://rossfsinger.com/blog/2010/09/a-proposal-to-serialize-marc-in-json/)
|
78
|
+
is a simple hash-based serialization format for MARC, often used with the
|
79
|
+
[jsonl](https://jsonlines.org/) (aka jsonlines or newline-delimited-json)
|
80
|
+
file format which puts one json structure on each line.
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
|
84
|
+
reader = MARC::JSONLReader.new("myfile.jsonl")
|
85
|
+
writer = MARC::JSONLWriter.new("my_other_file.jsonl")
|
86
|
+
reader.each do |record|
|
87
|
+
writer.write(record)
|
88
|
+
end
|
89
|
+
writer.close
|
90
|
+
|
91
|
+
```
|
92
|
+
|
93
|
+
### Reading/Writing MARC-XML
|
94
|
+
|
95
|
+
MARC-XML is an XML-based serialization format for MARC records. It is,
|
96
|
+
generally speaking, a lot slower than using MARC21 or marc-in-json.
|
97
|
+
|
98
|
+
There are two XML parsers supported going forwards within the ruby-marc code
|
99
|
+
base: REXML (the first, and for a long time only, ruby XML parser based on
|
100
|
+
regular expressions) and Nokogiri. Both are compatible with both MRI ("normal") ruby and JRuby.
|
101
|
+
|
102
|
+
The Nokogiri parser is about 6x faster than using REXML. See performance
|
103
|
+
numbers, below.
|
104
|
+
|
105
|
+
At one time, it was difficult to install Nokogiri under MRI and impossible
|
106
|
+
under JRuby. Because of this historical blip, nokogiri is _not_
|
107
|
+
automatically included when doing `require "marc"` in your code. If you want
|
108
|
+
to use the Nokogiri-based parser, you must include it explicitly.
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
require "nokogiri"
|
112
|
+
require "marc"
|
113
|
+
|
114
|
+
reader = MARC::XMLReader.new("myfile.xml", parser: "nokogiri")
|
115
|
+
```
|
116
|
+
|
117
|
+
The `parser` argument works as follows:
|
118
|
+
|
119
|
+
* if not included, REXML is used
|
120
|
+
* if "rexml" or "nokogiri", the appropriate parser will be used
|
121
|
+
* if "magic", the Nokogiri parser will be used if Nokogiri has been loaded;
|
122
|
+
otherwise it will fall back to using REXML.
|
123
|
+
|
124
|
+
```ruby
|
125
|
+
# Use the best available
|
126
|
+
reader = MARC::XMLReader.new("my_file.xml", parser: "magic")
|
127
|
+
```
|
128
|
+
|
129
|
+
### "Self-closing" writers
|
49
130
|
|
50
|
-
|
131
|
+
Much like one can [open a file and have it automatically close at the end
|
132
|
+
of a block](https://ruby-doc.org/core-2.5.0/File.html#method-c-open) in
|
133
|
+
standard ruby, the various writers will do the same.
|
51
134
|
|
52
|
-
|
53
|
-
ignored.
|
135
|
+
```ruby
|
54
136
|
|
55
|
-
|
137
|
+
# separate writer and #close
|
138
|
+
reader = MARC::Reader.new("my_marc.mrc")
|
139
|
+
writer = MARC::UnsafeXMLWriter.new("my_marc.xml")
|
140
|
+
reader.each do |record|
|
141
|
+
writer.write(record)
|
142
|
+
end
|
143
|
+
writer.close
|
56
144
|
|
57
|
-
|
145
|
+
# "self-closing" equivalent
|
146
|
+
reader = MARC::Reader.new("my_marc.mrc")
|
147
|
+
MARC::UnsafeXMLWriter.new("my_marc.xml") do |w|
|
148
|
+
reader.each do |record|
|
149
|
+
w.write(record)
|
150
|
+
end
|
151
|
+
end
|
152
|
+
# no need to close the writer here
|
153
|
+
```
|
154
|
+
|
155
|
+
### Serializing a single record
|
156
|
+
|
157
|
+
The `MARC::Record` class has utility functions to serialize to the various
|
158
|
+
formats. These are generally thin wrappers around the `encode` class
|
159
|
+
methods (e.g., `MARC::Writer.encode`, `MARC::XMLWriter.encode`, etc.)
|
160
|
+
|
161
|
+
* `record.to_marc` will produce a marc21 binary string
|
162
|
+
* `record.to_json_string` returns a string containing the JSON document
|
163
|
+
for the marc-in-json serialization
|
164
|
+
* This just json-ifies `record.to_hash`, which returns a hash compatible
|
165
|
+
with the marc-in-json format.
|
166
|
+
* `record.to_xml_string` returns the actual XML string, with the following
|
167
|
+
options:
|
168
|
+
* `include_namespace: true` (default: `true`) will include the MARC namespace
|
169
|
+
attributes
|
170
|
+
* `fast_but_unsafe: true` (default: `false`) will use the much faster
|
171
|
+
`MARC::UnsafeXMLWriter` code, which produces the XML by string
|
172
|
+
concatenation. See that class for more information, but in general, if
|
173
|
+
your MARC isn't wildly invalid, it works fine and is roughly 15x faster.
|
174
|
+
The default (REXML) simply does `record.to_xml.to_s`
|
175
|
+
|
176
|
+
Note that `record.to_xml`, for historical reasons, returns an REXML document of
|
177
|
+
the XML serialization and _not_ an XML string as one might expect.
|
178
|
+
|
179
|
+
|
180
|
+
## Benchmarking reading MARC in various formats
|
181
|
+
|
182
|
+
A simple benchmark run on a single thread on a 2017-era x64 Macintosh
|
183
|
+
gives the numbers below.
|
184
|
+
|
185
|
+
```
|
186
|
+
With mri 3.1.0 and jruby 9.3.6.0
|
187
|
+
|
188
|
+
Format Implementation Ruby r/sec x Slower compared to fastest
|
189
|
+
===================================================================
|
190
|
+
jsonl stdlib JSON mri 6512 1.0
|
191
|
+
jsonl O j mri 6199 1.0
|
192
|
+
marc21 MARC::Reader mri 2889 2.3
|
193
|
+
marc-xml Nokogiri mri 1451 4.6
|
194
|
+
marc-xml REXML mri 239 28.0
|
195
|
+
|
196
|
+
marc21 MARC::Reader jruby 5455 1.2
|
197
|
+
jsonl stdlib JSON jruby 5437 1.2
|
198
|
+
marc-xml Nokogiri jruby 1631 4.1
|
199
|
+
marc-xml REXML jruby 253 26.5
|
200
|
+
|
201
|
+
```
|
202
|
+
|
203
|
+
Note especially that if you're using MARC-XML, Nokogiri will read in
|
204
|
+
records 4-5 times faster.
|
205
|
+
|
206
|
+
## Character Encoding issues
|
207
|
+
|
208
|
+
The Marc binary (ISO 2709) Reader (MARC::Reader) has some features for helping
|
209
|
+
you deal with character encodings in ruby 1.9. It is always recommended to
|
210
|
+
supply an explicit :external_encoding option to MARC::Reader; either any valid
|
211
|
+
ruby encoding, _or_ the string "MARC-8".
|
212
|
+
MARC-8 input will by default be transcoded to a UTF-8 internal representation.
|
213
|
+
|
214
|
+
MARC::Reader does _not_ currently have any facilities for guessing encoding
|
215
|
+
from MARC21 leader byte 9; that byte is ignored.
|
216
|
+
|
217
|
+
Consult the MARC::Reader class docs for a more complete discussion and range
|
218
|
+
of options.
|
219
|
+
|
220
|
+
The MARC binary Writer (MARC::Writer) does not have any encoding-related
|
221
|
+
features -- it's up to you the developer to make sure you create MARC::Records
|
222
|
+
with consistent and expected char encodings, although MARC::Writer will write
|
223
|
+
out a legal ISO 2709 either way, it just might have corrupted encodings.
|
58
224
|
|
59
225
|
When parsing MARCXML _with Nokogiri as your XML parser implementation_ up to
|
60
|
-
and including version `1.0.2` of this gem, if the XML was badly formed,
|
61
|
-
would stop and no error would be reported to your code.
|
226
|
+
and including version `1.0.2` of this gem, if the XML was badly formed,
|
227
|
+
parsing would stop and no error would be reported to your code.
|
62
228
|
|
63
229
|
If you are using a version > `1.0.2` of `ruby-marc` with MRI + Nokogiri, XML
|
64
230
|
syntax errors will be thrown (and you may need to adjust your code to account
|
@@ -67,17 +233,44 @@ using Nokogiri as an XML parser with JRuby as your ruby implementation, XML
|
|
67
233
|
syntax errors will still be ignored unless you have Nokogiri version `1.10.2`
|
68
234
|
or later.
|
69
235
|
|
70
|
-
##
|
236
|
+
## JRubySTAXReader caveats
|
237
|
+
|
238
|
+
NOTE: The JRubyStaxReader is deprecated. Nokogiri should be used instead.
|
239
|
+
|
240
|
+
- Under Java 9+, MARC::JRubySTAXReader requires adding the following
|
241
|
+
to `JAVA_OPTS`
|
242
|
+
in order to work
|
243
|
+
around [Java module system](https://openjdk.java.net/jeps/261)
|
244
|
+
restrictions:
|
245
|
+
|
246
|
+
```sh
|
247
|
+
--add-opens java.xml/com.sun.org.apache.xerces.internal.impl=org.jruby.dist
|
248
|
+
```
|
249
|
+
|
250
|
+
- MARC::JRubySTAXReader is deprecated and will be removed in a future version
|
251
|
+
of
|
252
|
+
`ruby-marc`. Please use MARC::NokogiriReader instead.
|
253
|
+
|
254
|
+
## Miscellany
|
71
255
|
|
72
256
|
Source code at: https://github.com/ruby-marc/ruby-marc/
|
73
257
|
|
74
258
|
Find generated API docs at: http://rubydoc.info/gems/marc/frames
|
75
259
|
|
76
|
-
Run automated tests in source with `rake test`.
|
260
|
+
Run automated tests in source with `rake test`.
|
261
|
+
|
262
|
+
Developers, release new version of gem to rubygems with `rake release`
|
263
|
+
(bundler-supplied task). Note that one nice thing this will do is
|
264
|
+
automatically tag the version in git, very important for later figuring out
|
265
|
+
what's going on.
|
77
266
|
|
78
|
-
|
79
|
-
|
80
|
-
|
267
|
+
## Installation
|
268
|
+
|
269
|
+
gem install marc
|
270
|
+
|
271
|
+
Or if you're using bundler, add to your Gemfile
|
272
|
+
|
273
|
+
gem 'marc'
|
81
274
|
|
82
275
|
## Authors
|
83
276
|
|