marc 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/marc/reader.rb +4 -0
- data/lib/marc/version.rb +1 -1
- data/lib/marc/xml_parsers.rb +12 -9
- data/lib/marc/xmlreader.rb +19 -13
- data/test/tc_parsers.rb +11 -9
- data/test/tc_xml.rb +6 -4
- metadata +51 -77
- data/test/bare_cp866.txt +0 -1
- data/test/jruby_bad_transcode.rb +0 -52
- data/test/jruby_just_string.rb +0 -39
- data/test/tc_bare_ruby_strings.rb +0 -43
- data/test/test_cp866.txt +0 -1
data/lib/marc/reader.rb
CHANGED
@@ -215,6 +215,10 @@ module MARC
|
|
215
215
|
# declared on the string passed in.
|
216
216
|
params[:external_encoding] = marc.encoding
|
217
217
|
end
|
218
|
+
# And now that we've recorded the current encoding, we force
|
219
|
+
# to binary encoding, because we're going to be doing byte arithmetic,
|
220
|
+
# and want to avoid byte-vs-char confusion.
|
221
|
+
marc.force_encoding("binary") if marc.respond_to?(:force_encoding)
|
218
222
|
|
219
223
|
record = Record.new()
|
220
224
|
record.leader = marc[0..LEADER_LENGTH-1]
|
data/lib/marc/version.rb
CHANGED
data/lib/marc/xml_parsers.rb
CHANGED
@@ -112,7 +112,7 @@ module MARC
|
|
112
112
|
|
113
113
|
def method_missing(methName, *args)
|
114
114
|
sax_methods = [:xmldecl, :start_document, :end_document, :start_element,
|
115
|
-
:end_element, :comment, :warning, :error, :cdata_block]
|
115
|
+
:end_element, :comment, :warning, :error, :cdata_block, :processing_instruction]
|
116
116
|
unless sax_methods.index(methName)
|
117
117
|
raise NoMethodError.new("undefined method '#{methName} for #{self}", 'no_meth')
|
118
118
|
end
|
@@ -293,7 +293,11 @@ module MARC
|
|
293
293
|
end
|
294
294
|
end
|
295
295
|
|
296
|
-
|
296
|
+
|
297
|
+
|
298
|
+
|
299
|
+
unless defined? JRUBY_VERSION
|
300
|
+
module LibXMLReader
|
297
301
|
|
298
302
|
def self.extended(receiver)
|
299
303
|
require 'xml'
|
@@ -344,6 +348,7 @@ module MARC
|
|
344
348
|
return r
|
345
349
|
end
|
346
350
|
end
|
351
|
+
end
|
347
352
|
|
348
353
|
# The JrubySTAXReader uses native java calls to parse the incoming stream
|
349
354
|
# of marc-xml. It includes most of the work from GenericPullParser
|
@@ -352,9 +357,7 @@ module MARC
|
|
352
357
|
module JRubySTAXReader
|
353
358
|
include GenericPullParser
|
354
359
|
def self.extended(receiver)
|
355
|
-
|
356
|
-
java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
|
357
|
-
include javax.xml.stream
|
360
|
+
require 'java' # may only be necessary in jruby 1.6
|
358
361
|
receiver.init
|
359
362
|
end
|
360
363
|
|
@@ -373,13 +376,13 @@ module MARC
|
|
373
376
|
end
|
374
377
|
|
375
378
|
def parser_dispatch
|
376
|
-
while event = @parser.next and event != XMLStreamConstants.END_DOCUMENT do
|
379
|
+
while event = @parser.next and event != javax.xml.stream.XMLStreamConstants.END_DOCUMENT do
|
377
380
|
case event
|
378
|
-
when XMLStreamConstants.START_ELEMENT
|
381
|
+
when javax.xml.stream.XMLStreamConstants.START_ELEMENT
|
379
382
|
start_element_namespace(@parser.getLocalName, [], nil, @parser.getNamespaceURI, nil)
|
380
|
-
when XMLStreamConstants.END_ELEMENT
|
383
|
+
when javax.xml.stream.XMLStreamConstants.END_ELEMENT
|
381
384
|
end_element_namespace(@parser.getLocalName, @parser.getPrefix, @parser.getNamespaceURI)
|
382
|
-
when XMLStreamConstants.CHARACTERS
|
385
|
+
when javax.xml.stream.XMLStreamConstants.CHARACTERS
|
383
386
|
characters(@parser.getText)
|
384
387
|
end
|
385
388
|
end
|
data/lib/marc/xmlreader.rb
CHANGED
@@ -69,6 +69,7 @@ module MARC
|
|
69
69
|
raise ArgumentError, "jstax only available under jruby" unless defined? JRUBY_VERSION
|
70
70
|
extend JRubySTAXReader
|
71
71
|
when 'libxml' then extend LibXMLReader
|
72
|
+
raise ArgumentError, "libxml not available under jruby" if defined? JRUBY_VERSION
|
72
73
|
end
|
73
74
|
end
|
74
75
|
|
@@ -95,14 +96,9 @@ module MARC
|
|
95
96
|
# Returns the value of the best available parser
|
96
97
|
def self.best_available
|
97
98
|
parser = nil
|
98
|
-
jruby = [
|
99
|
+
jruby = [USE_NOKOGIRI, USE_JSTAX, USE_JREXML]
|
99
100
|
ruby = [USE_NOKOGIRI, USE_LIBXML]
|
100
101
|
if defined? JRUBY_VERSION
|
101
|
-
begin
|
102
|
-
java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
|
103
|
-
parser = USE_JSTAX
|
104
|
-
rescue java.lang.ClassNotFoundException
|
105
|
-
end
|
106
102
|
unless parser
|
107
103
|
begin
|
108
104
|
require 'nokogiri'
|
@@ -110,6 +106,14 @@ module MARC
|
|
110
106
|
rescue LoadError
|
111
107
|
end
|
112
108
|
end
|
109
|
+
unless parser
|
110
|
+
begin
|
111
|
+
# try to find the class, so we throw an error if not found
|
112
|
+
java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
|
113
|
+
parser = USE_JSTAX
|
114
|
+
rescue java.lang.ClassNotFoundException
|
115
|
+
end
|
116
|
+
end
|
113
117
|
unless parser
|
114
118
|
begin
|
115
119
|
require 'jrexml'
|
@@ -123,13 +127,15 @@ module MARC
|
|
123
127
|
parser = USE_NOKOGIRI
|
124
128
|
rescue LoadError
|
125
129
|
end
|
126
|
-
unless
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
130
|
+
unless defined? JRUBY_VERSION
|
131
|
+
unless parser
|
132
|
+
begin
|
133
|
+
require 'xml'
|
134
|
+
parser = USE_LIBXML
|
135
|
+
rescue LoadError
|
136
|
+
end
|
137
|
+
end
|
138
|
+
end
|
133
139
|
end
|
134
140
|
parser = USE_REXML unless parser
|
135
141
|
parser
|
data/test/tc_parsers.rb
CHANGED
@@ -164,15 +164,6 @@ end
|
|
164
164
|
def choose_best_available_parser
|
165
165
|
parser_name = nil
|
166
166
|
parser = nil
|
167
|
-
if defined? JRUBY_VERSION
|
168
|
-
require 'java'
|
169
|
-
begin
|
170
|
-
java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
|
171
|
-
parser_name = "jstax"
|
172
|
-
parser = Java::ComSunOrgApacheXercesInternalImpl::XMLStreamReaderImpl
|
173
|
-
rescue java.lang.ClassNotFoundException
|
174
|
-
end
|
175
|
-
end
|
176
167
|
unless parser
|
177
168
|
begin
|
178
169
|
require 'nokogiri'
|
@@ -181,6 +172,17 @@ end
|
|
181
172
|
rescue LoadError
|
182
173
|
end
|
183
174
|
end
|
175
|
+
unless parser
|
176
|
+
if defined? JRUBY_VERSION
|
177
|
+
require 'java'
|
178
|
+
begin
|
179
|
+
java.lang.Class.forName("javax.xml.stream.XMLInputFactory")
|
180
|
+
parser_name = "jstax"
|
181
|
+
parser = Java::ComSunOrgApacheXercesInternalImpl::XMLStreamReaderImpl
|
182
|
+
rescue java.lang.ClassNotFoundException
|
183
|
+
end
|
184
|
+
end
|
185
|
+
end
|
184
186
|
unless parser
|
185
187
|
if !defined? JRUBY_VERSION
|
186
188
|
begin
|
data/test/tc_xml.rb
CHANGED
@@ -10,10 +10,12 @@ class XMLTest < Test::Unit::TestCase
|
|
10
10
|
@parsers << :nokogiri
|
11
11
|
rescue LoadError
|
12
12
|
end
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
unless defined? JRUBY_VERSION
|
14
|
+
begin
|
15
|
+
require 'xml'
|
16
|
+
@parsers << :libxml
|
17
|
+
rescue LoadError
|
18
|
+
end
|
17
19
|
end
|
18
20
|
if defined? JRUBY_VERSION
|
19
21
|
begin
|
metadata
CHANGED
@@ -1,15 +1,10 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: marc
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 5
|
9
|
-
- 0
|
10
|
-
version: 0.5.0
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.5.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Kevin Clarke
|
14
9
|
- Bill Dueber
|
15
10
|
- William Groppe
|
@@ -18,104 +13,83 @@ authors:
|
|
18
13
|
autorequire: marc
|
19
14
|
bindir: bin
|
20
15
|
cert_chain: []
|
21
|
-
|
22
|
-
date: 2012-05-07 00:00:00 Z
|
16
|
+
date: 2013-07-11 00:00:00.000000000 Z
|
23
17
|
dependencies: []
|
24
|
-
|
25
|
-
description:
|
18
|
+
description:
|
26
19
|
email: ehs@pobox.com
|
27
20
|
executables: []
|
28
|
-
|
29
21
|
extensions: []
|
30
|
-
|
31
22
|
extra_rdoc_files: []
|
32
|
-
|
33
|
-
|
34
|
-
- lib/marc/
|
23
|
+
files:
|
24
|
+
- lib/marc.rb
|
25
|
+
- lib/marc/constants.rb
|
35
26
|
- lib/marc/controlfield.rb
|
36
|
-
- lib/marc/reader.rb
|
37
|
-
- lib/marc/dublincore.rb
|
38
|
-
- lib/marc/xmlwriter.rb
|
39
27
|
- lib/marc/datafield.rb
|
40
|
-
- lib/marc/
|
28
|
+
- lib/marc/dublincore.rb
|
41
29
|
- lib/marc/exception.rb
|
30
|
+
- lib/marc/reader.rb
|
31
|
+
- lib/marc/record.rb
|
32
|
+
- lib/marc/subfield.rb
|
33
|
+
- lib/marc/version.rb
|
42
34
|
- lib/marc/writer.rb
|
35
|
+
- lib/marc/xml_parsers.rb
|
43
36
|
- lib/marc/xmlreader.rb
|
44
|
-
- lib/marc/
|
45
|
-
-
|
46
|
-
- lib/marc/subfield.rb
|
47
|
-
- lib/marc.rb
|
48
|
-
- test/one.dat
|
49
|
-
- test/one.xml
|
37
|
+
- lib/marc/xmlwriter.rb
|
38
|
+
- test/batch.dat
|
50
39
|
- test/batch.xml
|
51
40
|
- test/cp866_multirecord.marc
|
52
|
-
- test/
|
41
|
+
- test/cp866_unimarc.marc
|
42
|
+
- test/marc8_accented_chars.marc
|
53
43
|
- test/no-leading-zero.xml
|
54
|
-
- test/tc_record.rb
|
55
|
-
- test/tc_parsers.rb
|
56
|
-
- test/tc_hash.rb
|
57
|
-
- test/tc_subfield.rb
|
58
44
|
- test/non-numeric.dat
|
59
|
-
- test/
|
60
|
-
- test/
|
61
|
-
- test/
|
45
|
+
- test/non-numeric.xml
|
46
|
+
- test/one.dat
|
47
|
+
- test/one.xml
|
48
|
+
- test/random_tag_order.dat
|
49
|
+
- test/random_tag_order2.dat
|
62
50
|
- test/tc_controlfield.rb
|
63
|
-
- test/tc_bare_ruby_strings.rb
|
64
|
-
- test/tc_marchash.rb
|
65
|
-
- test/marc8_accented_chars.marc
|
66
|
-
- test/cp866_unimarc.marc
|
67
51
|
- test/tc_datafield.rb
|
68
|
-
- test/tc_reader_char_encodings.rb
|
69
|
-
- test/utf8_with_bad_bytes.marc
|
70
|
-
- test/utf8.marc
|
71
|
-
- test/bare_cp866.txt
|
72
52
|
- test/tc_dublincore.rb
|
73
|
-
- test/
|
53
|
+
- test/tc_hash.rb
|
54
|
+
- test/tc_marchash.rb
|
55
|
+
- test/tc_parsers.rb
|
56
|
+
- test/tc_reader.rb
|
57
|
+
- test/tc_reader_char_encodings.rb
|
58
|
+
- test/tc_record.rb
|
59
|
+
- test/tc_subfield.rb
|
74
60
|
- test/tc_writer.rb
|
75
61
|
- test/tc_xml.rb
|
76
|
-
- test/non-numeric.xml
|
77
|
-
- test/random_tag_order2.dat
|
78
62
|
- test/ts_marc.rb
|
79
|
-
- test/
|
80
|
-
- test/
|
63
|
+
- test/utf8.marc
|
64
|
+
- test/utf8_multirecord.marc
|
65
|
+
- test/utf8_with_bad_bytes.marc
|
81
66
|
- Rakefile
|
82
67
|
- README.md
|
83
68
|
- Changes
|
84
69
|
- LICENSE
|
85
70
|
homepage: https://github.com/ruby-marc/ruby-marc/
|
86
71
|
licenses: []
|
87
|
-
|
88
|
-
post_install_message:
|
72
|
+
post_install_message:
|
89
73
|
rdoc_options: []
|
90
|
-
|
91
|
-
require_paths:
|
74
|
+
require_paths:
|
92
75
|
- lib
|
93
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
- !ruby/object:Gem::Version
|
98
|
-
hash: 59
|
99
|
-
segments:
|
100
|
-
- 1
|
101
|
-
- 8
|
102
|
-
- 6
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
103
80
|
version: 1.8.6
|
104
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
81
|
none: false
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
version: "0"
|
82
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
83
|
+
requirements:
|
84
|
+
- - '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
none: false
|
113
88
|
requirements: []
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
signing_key:
|
89
|
+
rubyforge_project:
|
90
|
+
rubygems_version: 1.8.24
|
91
|
+
signing_key:
|
118
92
|
specification_version: 3
|
119
93
|
summary: A ruby library for working with Machine Readable Cataloging
|
120
|
-
test_files:
|
94
|
+
test_files:
|
121
95
|
- test/ts_marc.rb
|
data/test/bare_cp866.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Междунар. новости мира пластмасс
|
data/test/jruby_bad_transcode.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
# 1.9.3p0 :005 > 0x8D.chr.force_encoding("cp866").encode("UTF-8")
|
4
|
-
utf8 = "Н".force_encoding("UTF-8")
|
5
|
-
|
6
|
-
puts "There's a cyrillic letter that looks kinda like a capital H. Here's what it looks like in unicode: Н"
|
7
|
-
|
8
|
-
puts "In unicode, that's byte array: " + utf8.bytes.to_a.inspect
|
9
|
-
|
10
|
-
puts "We're gonna use String#encode to convert it to an IBM866 encoding, also known as cp866, an encoding sometimes used in Russia."
|
11
|
-
|
12
|
-
|
13
|
-
puts " `utf8.encode(\"IBM866\")`"
|
14
|
-
|
15
|
-
cp866 = utf8.encode("IBM866")
|
16
|
-
puts cp866.bytes.to_a.inspect
|
17
|
-
|
18
|
-
exit
|
19
|
-
|
20
|
-
puts
|
21
|
-
puts "In cp866, the actual bytes are: #{cp866_phrase.bytes.to_a.inspect}"
|
22
|
-
puts
|
23
|
-
|
24
|
-
puts "We're going to write the cp866 string to disk, using binary:binary to try and make sure we get the bytes to disk without transcoding."
|
25
|
-
|
26
|
-
write = File.open("test_cp866.txt", "w", :internal_encoding => "binary", :external_encoding => "binary")
|
27
|
-
write.puts cp866_phrase
|
28
|
-
write.close
|
29
|
-
puts
|
30
|
-
|
31
|
-
puts "Now we're going to read it in with a File object with external_encoding set to IBM866, but no internal_encoding set."
|
32
|
-
|
33
|
-
puts
|
34
|
-
puts "Make sure we have no default internal_encoding: " + Encoding.default_internal.nil?.inspect
|
35
|
-
|
36
|
-
read = File.open("test_cp866.txt", :external_encoding => "cp866")
|
37
|
-
puts
|
38
|
-
puts "Our ruby file object should have external_encoding of IBM866: " + read.external_encoding.inspect
|
39
|
-
puts " and internal_encoding nil: " + read.internal_encoding.inspect
|
40
|
-
|
41
|
-
puts
|
42
|
-
|
43
|
-
read_in_string = read.read
|
44
|
-
read.close
|
45
|
-
|
46
|
-
puts "The encoding of the string we read in should be IBM866: " + (read_in_string.encoding.name == "IBM866").inspect
|
47
|
-
|
48
|
-
puts
|
49
|
-
puts "And the bytes should be the very same bytes we wrote out (which are valid cp866) " + (read_in_string.bytes.to_a[0,3] == [140, 165, 166]).inspect + " (#{read_in_string.bytes.to_a})"
|
50
|
-
|
51
|
-
puts "The above is TRUE in MRI 1.9.3, but FALSE in jruby "
|
52
|
-
|
data/test/jruby_just_string.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
# encoding: binary
|
2
|
-
|
3
|
-
# jruby 1.6.7 (ruby-1.9.2-p312) (2012-02-22 3e82bc8) (Java HotSpot(TM) 64-Bit Server VM 1.6.0_20) [linux-amd64-java]
|
4
|
-
|
5
|
-
# There is a letter in cyrillic that looks kind of like a capital
|
6
|
-
# H. In the cp866 encoding (http://en.wikipedia.org/wiki/Code_page_866)
|
7
|
-
# it's represented by "\x8D" which is decimal 141.
|
8
|
-
#
|
9
|
-
# In ruby 1.9, it _ought_ to be possible to have those bytes
|
10
|
-
# in a string, and tell ruby it's cp866.
|
11
|
-
|
12
|
-
cp866 = "\x8D".force_encoding("IBM866")
|
13
|
-
|
14
|
-
# in MRI 1.9.3, if we inspect that, we get "\x8D", just like we expect.
|
15
|
-
# and if we look at #bytes.to_a, we get [141], just like we expect.
|
16
|
-
puts cp866.inspect
|
17
|
-
puts cp866.bytes.to_a.inspect
|
18
|
-
# However, in jruby if we #inspect instead of getting "\x8D",
|
19
|
-
# we get "\u008D" -- this is wrong, it's NOT that unicode codepoint.
|
20
|
-
# In jruby, bytes.to_a.inspect is still [141], it hasn't changed
|
21
|
-
# the bytes, but it's confused about what's going on.
|
22
|
-
|
23
|
-
# We see this encoding confusion demonstrated if we try
|
24
|
-
# a String#encode.
|
25
|
-
#
|
26
|
-
# MRI 1.9.3 is perfectly capable of transcoding this to UTF-8
|
27
|
-
|
28
|
-
utf8 = cp866.encode("UTF-8")
|
29
|
-
puts utf8.inspect # => in MRI displays cyrillic in terminal no prob
|
30
|
-
puts utf8.bytes.to_a.inspect # => in MRI [208, 157], proper bytes for utf8
|
31
|
-
|
32
|
-
# In jruby, puts utf8.inspect displays "\u008D", and
|
33
|
-
# utf8.bytes.to_a.inspect is [194, 141]. I don't know where the
|
34
|
-
# 194 came from, but it has NOT successfully transcoded to utf8.
|
35
|
-
|
36
|
-
# In other cases, the #encode will actually raise an illegal byte
|
37
|
-
# exception if the original bytes were not legal for UTF8 (or UTF16?) --
|
38
|
-
# but the original bytes were not meant to be considered unicode at all.
|
39
|
-
|
data/test/tc_bare_ruby_strings.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
|
3
|
-
class TestBareRubyStrings < Test::Unit::TestCase
|
4
|
-
|
5
|
-
# The file bare_cp866.txt has in it a phrase encoded in cp866,
|
6
|
-
# that if it were translated to utf8 would be:
|
7
|
-
# "Междунар. новости мира пластмасс\n"
|
8
|
-
#
|
9
|
-
# The first few bytes of that in utf8 are:
|
10
|
-
# "\xD0\x9C\xD0\xB5"
|
11
|
-
#
|
12
|
-
# In cp866 as it is on disk, its first few bytes are "\x8C\xA5"
|
13
|
-
|
14
|
-
def test_read_cp866_with_external_encoding
|
15
|
-
return
|
16
|
-
file = File.open("test/bare_cp866.txt", "r:cp866")
|
17
|
-
string = file.read
|
18
|
-
|
19
|
-
assert_equal "IBM866", string.encoding.name
|
20
|
-
|
21
|
-
cp866_binary = string.dup.force_encoding("binary")
|
22
|
-
assert cp866_binary.start_with?( "\x8C\xA5".force_encoding("binary") )
|
23
|
-
|
24
|
-
transcoded = string.encode("UTF-8")
|
25
|
-
assert_equal "UTF-8", transcoded.encoding.name
|
26
|
-
|
27
|
-
utf8_binary = transcoded.dup.force_encoding("binary")
|
28
|
-
|
29
|
-
assert utf8_binary.start_with?( "\xD0\x9C\xD0\xB5".force_encoding("binary"))
|
30
|
-
end
|
31
|
-
|
32
|
-
def test_read_cp866_binary_all_the_way
|
33
|
-
# tell ruby to treat it as binary binary binary
|
34
|
-
file = File.open("test/bare_cp866.txt", :external_encoding => "binary", :internal_encoding => "binary")
|
35
|
-
|
36
|
-
string = file.read
|
37
|
-
|
38
|
-
# we should get the same bytes that were on disk, right?
|
39
|
-
assert string.start_with?( "\x8C\xA5".force_encoding("binary"))
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
end
|
data/test/test_cp866.txt
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
Меж
|